#ifndef CUFFTDX_FFT_1024_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_1024_FP32_INV_PTX_HPP



template<> __forceinline__ __device__ void cufftdx_private_function<285, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1873>;
.reg .b32 r<24>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1868, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1866, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1865, f1868, f1866;
sub.f32 f140, f1868, f1866;
sub.f32 f141, f131, f136;
add.f32 f143, f131, f136;
add.f32 f1864, f132, f135;
sub.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1861, %132, %133;
sub.f32 f148, %132, %133;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1859, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1858, f1861, f1859;
sub.f32 f156, f1861, f1859;
sub.f32 f157, f147, f152;
add.f32 f159, f147, f152;
add.f32 f1857, f148, f151;
sub.f32 f160, f148, f151;
mul.f32 f161, f157, 0f3F3504F3;
mul.f32 f162, f1857, 0f3F3504F3;
sub.f32 f163, f161, f162;
add.f32 f164, f161, f162;
mul.f32 f1855, f159, 0fBF3504F3;
mul.f32 f1856, f160, 0f3F3504F3;
sub.f32 f167, f1855, f1856;
mul.f32 f168, f160, 0fBF3504F3;
fma.rn.f32 f169, f159, 0f3F3504F3, f168;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1854, f1865, f1858;
sub.f32 f173, f1865, f1858;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1853, f1864, f164;
sub.f32 f177, f1864, f164;
sub.f32 f178, f139, f156;
add.f32 f180, f139, f156;
add.f32 f1852, f140, f155;
sub.f32 f181, f140, f155;
add.f32 f182, f143, f167;
sub.f32 f184, f143, f167;
add.f32 f1851, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1849, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1846, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1845, f1849, f1846;
sub.f32 f197, f1849, f1846;
sub.f32 f198, f188, f193;
add.f32 f200, f188, f193;
add.f32 f1844, f189, f192;
sub.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1842, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1840, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1839, f1842, f1840;
sub.f32 f213, f1842, f1840;
sub.f32 f214, f204, f209;
add.f32 f216, f204, f209;
add.f32 f1838, f205, f208;
sub.f32 f217, f205, f208;
mul.f32 f218, f214, 0f3F3504F3;
mul.f32 f219, f1838, 0f3F3504F3;
sub.f32 f220, f218, f219;
add.f32 f221, f218, f219;
mul.f32 f1836, f216, 0fBF3504F3;
mul.f32 f1837, f217, 0f3F3504F3;
sub.f32 f224, f1836, f1837;
mul.f32 f225, f217, 0fBF3504F3;
fma.rn.f32 f226, f216, 0f3F3504F3, f225;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1835, f1845, f1839;
sub.f32 f230, f1845, f1839;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1834, f1844, f221;
sub.f32 f234, f1844, f221;
sub.f32 f235, f196, f213;
add.f32 f237, f196, f213;
add.f32 f1833, f197, f212;
sub.f32 f238, f197, f212;
add.f32 f239, f200, f224;
sub.f32 f241, f200, f224;
add.f32 f1832, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1830, f231, 0f3F6C835E;
mul.f32 f1831, f1834, 0f3EC3EF15;
sub.f32 f245, f1830, f1831;
mul.f32 f246, f1834, 0f3F6C835E;
fma.rn.f32 f247, f231, 0f3EC3EF15, f246;
mul.f32 f248, f235, 0f3F3504F3;
mul.f32 f249, f1833, 0f3F3504F3;
sub.f32 f250, f248, f249;
add.f32 f251, f248, f249;
mul.f32 f253, f1832, 0f3F6C835E;
mul.f32 f1829, f239, 0f3EC3EF15;
sub.f32 f254, f1829, f253;
mul.f32 f255, f1832, 0f3EC3EF15;
fma.rn.f32 f256, f239, 0f3F6C835E, f255;
mul.f32 f258, f234, 0f3F6C835E;
mul.f32 f1828, f233, 0fBEC3EF15;
sub.f32 f259, f1828, f258;
mul.f32 f260, f234, 0fBEC3EF15;
fma.rn.f32 f261, f233, 0f3F6C835E, f260;
mul.f32 f1826, f237, 0fBF3504F3;
mul.f32 f1827, f238, 0f3F3504F3;
sub.f32 f264, f1826, f1827;
mul.f32 f265, f238, 0fBF3504F3;
fma.rn.f32 f266, f237, 0f3F3504F3, f265;
mul.f32 f1824, f241, 0fBF6C835E;
mul.f32 f1825, f242, 0f3EC3EF15;
sub.f32 f269, f1824, f1825;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0f3EC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1823, f1854, f1835;
sub.f32 f275, f1854, f1835;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1822, f1853, f247;
sub.f32 f279, f1853, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1821, f1852, f251;
sub.f32 f283, f1852, f251;
add.f32 f284, f182, f254;
sub.f32 f286, f182, f254;
add.f32 f1820, f1851, f256;
sub.f32 f287, f1851, f256;
sub.f32 f288, f172, f230;
add.f32 f290, f172, f230;
add.f32 f1819, f173, f229;
sub.f32 f291, f173, f229;
add.f32 f292, f176, f259;
sub.f32 f294, f176, f259;
add.f32 f1818, f177, f261;
sub.f32 f295, f177, f261;
add.f32 f296, f180, f264;
sub.f32 f298, f180, f264;
add.f32 f1817, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1816, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1813, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1811, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1810, f1813, f1811;
sub.f32 f315, f1813, f1811;
sub.f32 f316, f306, f311;
add.f32 f318, f306, f311;
add.f32 f1809, f307, f310;
sub.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1807, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1804, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1803, f1807, f1804;
sub.f32 f331, f1807, f1804;
sub.f32 f332, f322, f327;
add.f32 f334, f322, f327;
add.f32 f1802, f323, f326;
sub.f32 f335, f323, f326;
mul.f32 f336, f332, 0f3F3504F3;
mul.f32 f337, f1802, 0f3F3504F3;
sub.f32 f338, f336, f337;
add.f32 f339, f336, f337;
mul.f32 f341, f335, 0f3F3504F3;
mul.f32 f1801, f334, 0fBF3504F3;
sub.f32 f342, f1801, f341;
mul.f32 f343, f335, 0fBF3504F3;
fma.rn.f32 f344, f334, 0f3F3504F3, f343;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1800, f1810, f1803;
sub.f32 f348, f1810, f1803;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1799, f1809, f339;
sub.f32 f352, f1809, f339;
sub.f32 f353, f314, f331;
add.f32 f355, f314, f331;
add.f32 f1798, f315, f330;
sub.f32 f356, f315, f330;
add.f32 f357, f318, f342;
sub.f32 f359, f318, f342;
add.f32 f1797, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1795, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1793, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1792, f1795, f1793;
sub.f32 f372, f1795, f1793;
sub.f32 f373, f363, f368;
add.f32 f375, f363, f368;
add.f32 f1791, f364, f367;
sub.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1788, %148, %149;
sub.f32 f380, %148, %149;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1787, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1786, f1788, f1787;
sub.f32 f388, f1788, f1787;
sub.f32 f389, f379, f384;
add.f32 f391, f379, f384;
add.f32 f1785, f380, f383;
sub.f32 f392, f380, f383;
mul.f32 f393, f389, 0f3F3504F3;
mul.f32 f394, f1785, 0f3F3504F3;
sub.f32 f395, f393, f394;
add.f32 f396, f393, f394;
mul.f32 f1783, f391, 0fBF3504F3;
mul.f32 f1784, f392, 0f3F3504F3;
sub.f32 f399, f1783, f1784;
mul.f32 f400, f392, 0fBF3504F3;
fma.rn.f32 f401, f391, 0f3F3504F3, f400;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1782, f1792, f1786;
sub.f32 f405, f1792, f1786;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1781, f1791, f396;
sub.f32 f409, f1791, f396;
sub.f32 f410, f371, f388;
add.f32 f412, f371, f388;
add.f32 f1780, f372, f387;
sub.f32 f413, f372, f387;
add.f32 f414, f375, f399;
sub.f32 f416, f375, f399;
add.f32 f1779, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1781, 0f3EC3EF15;
mul.f32 f1778, f406, 0f3F6C835E;
sub.f32 f420, f1778, f419;
mul.f32 f421, f1781, 0f3F6C835E;
fma.rn.f32 f422, f406, 0f3EC3EF15, f421;
mul.f32 f423, f410, 0f3F3504F3;
mul.f32 f424, f1780, 0f3F3504F3;
sub.f32 f425, f423, f424;
add.f32 f426, f423, f424;
mul.f32 f428, f1779, 0f3F6C835E;
mul.f32 f1777, f414, 0f3EC3EF15;
sub.f32 f429, f1777, f428;
mul.f32 f430, f1779, 0f3EC3EF15;
fma.rn.f32 f431, f414, 0f3F6C835E, f430;
mul.f32 f433, f409, 0f3F6C835E;
mul.f32 f1776, f408, 0fBEC3EF15;
sub.f32 f434, f1776, f433;
mul.f32 f435, f409, 0fBEC3EF15;
fma.rn.f32 f436, f408, 0f3F6C835E, f435;
mul.f32 f438, f413, 0f3F3504F3;
mul.f32 f1775, f412, 0fBF3504F3;
sub.f32 f439, f1775, f438;
mul.f32 f440, f413, 0fBF3504F3;
fma.rn.f32 f441, f412, 0f3F3504F3, f440;
mul.f32 f443, f417, 0f3EC3EF15;
mul.f32 f1774, f416, 0fBF6C835E;
sub.f32 f444, f1774, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0f3EC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1773, f1800, f1782;
sub.f32 f450, f1800, f1782;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1772, f1799, f422;
sub.f32 f454, f1799, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1771, f1798, f426;
sub.f32 f458, f1798, f426;
add.f32 f459, f357, f429;
sub.f32 f461, f357, f429;
add.f32 f1770, f1797, f431;
sub.f32 f462, f1797, f431;
sub.f32 f463, f347, f405;
add.f32 f465, f347, f405;
add.f32 f1769, f348, f404;
sub.f32 f466, f348, f404;
add.f32 f467, f351, f434;
sub.f32 f469, f351, f434;
add.f32 f1768, f352, f436;
sub.f32 f470, f352, f436;
add.f32 f471, f355, f439;
sub.f32 f473, f355, f439;
add.f32 f1767, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1766, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1772, 0f3E47C5C2;
mul.f32 f1765, f451, 0f3F7B14BE;
sub.f32 f481, f1765, f480;
mul.f32 f482, f1772, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0f3E47C5C2, f482;
mul.f32 f485, f1771, 0f3EC3EF15;
mul.f32 f1764, f455, 0f3F6C835E;
sub.f32 f486, f1764, f485;
mul.f32 f487, f1771, 0f3F6C835E;
fma.rn.f32 f488, f455, 0f3EC3EF15, f487;
mul.f32 f490, f1770, 0f3F0E39DA;
mul.f32 f1763, f459, 0f3F54DB31;
sub.f32 f491, f1763, f490;
mul.f32 f492, f1770, 0f3F54DB31;
fma.rn.f32 f493, f459, 0f3F0E39DA, f492;
mul.f32 f494, f463, 0f3F3504F3;
mul.f32 f495, f1769, 0f3F3504F3;
sub.f32 f496, f494, f495;
add.f32 f497, f494, f495;
mul.f32 f499, f1768, 0f3F54DB31;
mul.f32 f1762, f467, 0f3F0E39DA;
sub.f32 f500, f1762, f499;
mul.f32 f501, f1768, 0f3F0E39DA;
fma.rn.f32 f502, f467, 0f3F54DB31, f501;
mul.f32 f504, f1767, 0f3F6C835E;
mul.f32 f1761, f471, 0f3EC3EF15;
sub.f32 f505, f1761, f504;
mul.f32 f506, f1767, 0f3EC3EF15;
fma.rn.f32 f507, f471, 0f3F6C835E, f506;
mul.f32 f509, f1766, 0f3F7B14BE;
mul.f32 f1760, f475, 0f3E47C5C2;
sub.f32 f510, f1760, f509;
mul.f32 f511, f1766, 0f3E47C5C2;
fma.rn.f32 f512, f475, 0f3F7B14BE, f511;
mul.f32 f514, f454, 0f3F7B14BE;
mul.f32 f1759, f453, 0fBE47C5C2;
sub.f32 f515, f1759, f514;
mul.f32 f516, f454, 0fBE47C5C2;
fma.rn.f32 f517, f453, 0f3F7B14BE, f516;
mul.f32 f1757, f457, 0fBEC3EF15;
mul.f32 f1758, f458, 0f3F6C835E;
sub.f32 f520, f1757, f1758;
mul.f32 f521, f458, 0fBEC3EF15;
fma.rn.f32 f522, f457, 0f3F6C835E, f521;
mul.f32 f1755, f461, 0fBF0E39DA;
mul.f32 f1756, f462, 0f3F54DB31;
sub.f32 f525, f1755, f1756;
mul.f32 f526, f462, 0fBF0E39DA;
fma.rn.f32 f527, f461, 0f3F54DB31, f526;
mul.f32 f1753, f465, 0fBF3504F3;
mul.f32 f1754, f466, 0f3F3504F3;
sub.f32 f530, f1753, f1754;
mul.f32 f531, f466, 0fBF3504F3;
fma.rn.f32 f532, f465, 0f3F3504F3, f531;
mul.f32 f1751, f469, 0fBF54DB31;
mul.f32 f1752, f470, 0f3F0E39DA;
sub.f32 f535, f1751, f1752;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0f3F0E39DA, f536;
mul.f32 f539, f474, 0f3EC3EF15;
mul.f32 f1750, f473, 0fBF6C835E;
sub.f32 f540, f1750, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0f3EC3EF15, f541;
mul.f32 f544, f478, 0f3E47C5C2;
mul.f32 f1749, f477, 0fBF7B14BE;
sub.f32 f545, f1749, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0f3E47C5C2, f546;
add.f32 f550, f276, f481;
sub.f32 f552, f276, f481;
add.f32 f1748, f1822, f483;
sub.f32 f553, f1822, f483;
add.f32 f554, f280, f486;
sub.f32 f556, f280, f486;
add.f32 f1747, f1821, f488;
sub.f32 f557, f1821, f488;
add.f32 f558, f284, f491;
sub.f32 f560, f284, f491;
add.f32 f1746, f1820, f493;
sub.f32 f561, f1820, f493;
add.f32 f562, f288, f496;
sub.f32 f564, f288, f496;
add.f32 f1745, f1819, f497;
sub.f32 f565, f1819, f497;
add.f32 f566, f292, f500;
sub.f32 f568, f292, f500;
add.f32 f1744, f1818, f502;
sub.f32 f569, f1818, f502;
add.f32 f570, f296, f505;
sub.f32 f572, f296, f505;
add.f32 f1743, f1817, f507;
sub.f32 f573, f1817, f507;
add.f32 f574, f300, f510;
sub.f32 f576, f300, f510;
add.f32 f1742, f1816, f512;
sub.f32 f577, f1816, f512;
sub.f32 f578, f274, f450;
add.f32 f580, f274, f450;
add.f32 f1741, f275, f449;
sub.f32 f581, f275, f449;
add.f32 f582, f278, f515;
sub.f32 f584, f278, f515;
add.f32 f1740, f279, f517;
sub.f32 f585, f279, f517;
add.f32 f586, f282, f520;
sub.f32 f588, f282, f520;
add.f32 f1739, f283, f522;
sub.f32 f589, f283, f522;
add.f32 f590, f286, f525;
sub.f32 f592, f286, f525;
add.f32 f1738, f287, f527;
sub.f32 f593, f287, f527;
add.f32 f594, f290, f530;
sub.f32 f596, f290, f530;
add.f32 f1737, f291, f532;
sub.f32 f597, f291, f532;
add.f32 f598, f294, f535;
sub.f32 f600, f294, f535;
add.f32 f1736, f295, f537;
sub.f32 f601, f295, f537;
add.f32 f602, f298, f540;
sub.f32 f604, f298, f540;
add.f32 f1735, f299, f542;
sub.f32 f605, f299, f542;
add.f32 f606, f302, f545;
sub.f32 f608, f302, f545;
add.f32 f1734, f303, f547;
sub.f32 f609, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 8;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
and.b32 r14, r15, 31;
shl.b32 r10, r15, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f610, f611}, [rd5];
mul.f32 f614, f1748, f611;
mul.f32 f616, f610, f1748;
mul.f32 f618, f611, f611;
mul.f32 f1733, f610, f610;
sub.f32 f619, f1733, f618;
mul.f32 f620, f611, f610;
fma.rn.f32 f621, f611, f610, f620;
mul.f32 f622, f1747, f621;
mul.f32 f624, f619, f1747;
mul.f32 f626, f611, f621;
mul.f32 f1732, f610, f619;
sub.f32 f627, f1732, f626;
mul.f32 f1731, f554, f621;
mul.f32 f628, f610, f621;
fma.rn.f32 f629, f611, f619, f628;
mul.f32 f630, f1746, f629;
mul.f32 f632, f627, f1746;
mul.f32 f1729, f610, f627;
mul.f32 f1730, f611, f629;
sub.f32 f635, f1729, f1730;
mul.f32 f1728, f558, f629;
mul.f32 f636, f610, f629;
fma.rn.f32 f637, f611, f627, f636;
mul.f32 f638, f1745, f637;
mul.f32 f640, f635, f1745;
mul.f32 f642, f611, f637;
mul.f32 f1727, f610, f635;
sub.f32 f643, f1727, f642;
mul.f32 f1726, f562, f637;
mul.f32 f644, f610, f637;
fma.rn.f32 f645, f611, f635, f644;
mul.f32 f646, f1744, f645;
mul.f32 f648, f643, f1744;
mul.f32 f1724, f610, f643;
mul.f32 f1725, f611, f645;
sub.f32 f651, f1724, f1725;
mul.f32 f1723, f566, f645;
mul.f32 f652, f610, f645;
fma.rn.f32 f653, f611, f643, f652;
mul.f32 f654, f1743, f653;
mul.f32 f656, f651, f1743;
mul.f32 f658, f611, f653;
mul.f32 f1722, f610, f651;
sub.f32 f659, f1722, f658;
mul.f32 f1721, f570, f653;
mul.f32 f660, f610, f653;
fma.rn.f32 f661, f611, f651, f660;
mul.f32 f662, f1742, f661;
mul.f32 f664, f659, f1742;
mul.f32 f666, f611, f661;
mul.f32 f1720, f610, f659;
sub.f32 f667, f1720, f666;
mul.f32 f1719, f574, f661;
mul.f32 f668, f610, f661;
fma.rn.f32 f669, f611, f659, f668;
mul.f32 f670, f1741, f669;
mul.f32 f672, f667, f1741;
mul.f32 f1717, f610, f667;
mul.f32 f1718, f611, f669;
sub.f32 f675, f1717, f1718;
mul.f32 f1716, f578, f669;
mul.f32 f676, f610, f669;
fma.rn.f32 f677, f611, f667, f676;
mul.f32 f678, f1740, f677;
mul.f32 f680, f675, f1740;
mul.f32 f682, f611, f677;
mul.f32 f1715, f610, f675;
sub.f32 f683, f1715, f682;
mul.f32 f1714, f582, f677;
mul.f32 f684, f610, f677;
fma.rn.f32 f685, f611, f675, f684;
mul.f32 f686, f1739, f685;
mul.f32 f688, f683, f1739;
mul.f32 f690, f611, f685;
mul.f32 f1713, f610, f683;
sub.f32 f691, f1713, f690;
mul.f32 f1712, f586, f685;
mul.f32 f692, f610, f685;
fma.rn.f32 f693, f611, f683, f692;
mul.f32 f694, f1738, f693;
mul.f32 f696, f691, f1738;
mul.f32 f1710, f610, f691;
mul.f32 f1711, f611, f693;
sub.f32 f699, f1710, f1711;
mul.f32 f1709, f590, f693;
mul.f32 f700, f610, f693;
fma.rn.f32 f701, f611, f691, f700;
mul.f32 f702, f1737, f701;
mul.f32 f704, f699, f1737;
mul.f32 f706, f611, f701;
mul.f32 f1708, f610, f699;
sub.f32 f707, f1708, f706;
mul.f32 f1707, f594, f701;
mul.f32 f708, f610, f701;
fma.rn.f32 f709, f611, f699, f708;
mul.f32 f710, f1736, f709;
mul.f32 f712, f707, f1736;
mul.f32 f1705, f610, f707;
mul.f32 f1706, f611, f709;
sub.f32 f715, f1705, f1706;
mul.f32 f1704, f598, f709;
mul.f32 f716, f610, f709;
fma.rn.f32 f717, f611, f707, f716;
mul.f32 f718, f1735, f717;
mul.f32 f720, f715, f1735;
mul.f32 f722, f611, f717;
mul.f32 f1703, f610, f715;
sub.f32 f723, f1703, f722;
mul.f32 f1702, f602, f717;
mul.f32 f724, f610, f717;
fma.rn.f32 f725, f611, f715, f724;
mul.f32 f726, f1734, f725;
mul.f32 f728, f723, f1734;
mul.f32 f730, f611, f725;
mul.f32 f1701, f610, f723;
sub.f32 f731, f1701, f730;
mul.f32 f1700, f606, f725;
mul.f32 f732, f610, f725;
fma.rn.f32 f733, f611, f723, f732;
sub.f32 f1699, f1823, f1773;
mul.f32 f734, f1699, f733;
mul.f32 f736, f731, f1699;
mul.f32 f1697, f610, f731;
mul.f32 f1698, f611, f733;
sub.f32 f739, f1697, f1698;
sub.f32 f1696, f272, f447;
mul.f32 f1695, f1696, f733;
mul.f32 f740, f610, f733;
fma.rn.f32 f741, f611, f731, f740;
mul.f32 f742, f553, f741;
mul.f32 f744, f739, f553;
mul.f32 f746, f611, f741;
mul.f32 f1694, f610, f739;
sub.f32 f747, f1694, f746;
mul.f32 f1693, f552, f741;
mul.f32 f748, f610, f741;
fma.rn.f32 f749, f611, f739, f748;
mul.f32 f750, f557, f749;
mul.f32 f752, f747, f557;
mul.f32 f754, f611, f749;
mul.f32 f1692, f610, f747;
sub.f32 f755, f1692, f754;
mul.f32 f1691, f556, f749;
mul.f32 f756, f610, f749;
fma.rn.f32 f757, f611, f747, f756;
mul.f32 f758, f561, f757;
mul.f32 f760, f755, f561;
mul.f32 f1689, f610, f755;
mul.f32 f1690, f611, f757;
sub.f32 f763, f1689, f1690;
mul.f32 f1688, f560, f757;
mul.f32 f764, f610, f757;
fma.rn.f32 f765, f611, f755, f764;
mul.f32 f766, f565, f765;
mul.f32 f768, f763, f565;
mul.f32 f770, f611, f765;
mul.f32 f1687, f610, f763;
sub.f32 f771, f1687, f770;
mul.f32 f1686, f564, f765;
mul.f32 f772, f610, f765;
fma.rn.f32 f773, f611, f763, f772;
mul.f32 f774, f569, f773;
mul.f32 f776, f771, f569;
mul.f32 f1684, f610, f771;
mul.f32 f1685, f611, f773;
sub.f32 f779, f1684, f1685;
mul.f32 f1683, f568, f773;
mul.f32 f780, f610, f773;
fma.rn.f32 f781, f611, f771, f780;
mul.f32 f782, f573, f781;
mul.f32 f784, f779, f573;
mul.f32 f786, f611, f781;
mul.f32 f1682, f610, f779;
sub.f32 f787, f1682, f786;
mul.f32 f1681, f572, f781;
mul.f32 f788, f610, f781;
fma.rn.f32 f789, f611, f779, f788;
mul.f32 f790, f577, f789;
mul.f32 f792, f787, f577;
mul.f32 f794, f611, f789;
mul.f32 f1680, f610, f787;
sub.f32 f795, f1680, f794;
mul.f32 f1679, f576, f789;
mul.f32 f796, f610, f789;
fma.rn.f32 f797, f611, f787, f796;
mul.f32 f798, f581, f797;
mul.f32 f800, f795, f581;
mul.f32 f1677, f610, f795;
mul.f32 f1678, f611, f797;
sub.f32 f803, f1677, f1678;
mul.f32 f1676, f580, f797;
mul.f32 f804, f610, f797;
fma.rn.f32 f805, f611, f795, f804;
mul.f32 f806, f585, f805;
mul.f32 f808, f803, f585;
mul.f32 f810, f611, f805;
mul.f32 f1675, f610, f803;
sub.f32 f811, f1675, f810;
mul.f32 f1674, f584, f805;
mul.f32 f812, f610, f805;
fma.rn.f32 f813, f611, f803, f812;
mul.f32 f814, f589, f813;
mul.f32 f816, f811, f589;
mul.f32 f818, f611, f813;
mul.f32 f1673, f610, f811;
sub.f32 f819, f1673, f818;
mul.f32 f1672, f588, f813;
mul.f32 f820, f610, f813;
fma.rn.f32 f821, f611, f811, f820;
mul.f32 f822, f593, f821;
mul.f32 f824, f819, f593;
mul.f32 f1670, f610, f819;
mul.f32 f1671, f611, f821;
sub.f32 f827, f1670, f1671;
mul.f32 f1669, f592, f821;
mul.f32 f828, f610, f821;
fma.rn.f32 f829, f611, f819, f828;
mul.f32 f830, f597, f829;
mul.f32 f832, f827, f597;
mul.f32 f834, f611, f829;
mul.f32 f1668, f610, f827;
sub.f32 f835, f1668, f834;
mul.f32 f1667, f596, f829;
mul.f32 f836, f610, f829;
fma.rn.f32 f837, f611, f827, f836;
mul.f32 f838, f601, f837;
mul.f32 f840, f835, f601;
mul.f32 f1665, f610, f835;
mul.f32 f1666, f611, f837;
sub.f32 f843, f1665, f1666;
mul.f32 f1664, f600, f837;
mul.f32 f844, f610, f837;
fma.rn.f32 f845, f611, f835, f844;
mul.f32 f846, f605, f845;
mul.f32 f848, f843, f605;
mul.f32 f850, f611, f845;
mul.f32 f1663, f610, f843;
sub.f32 f851, f1663, f850;
mul.f32 f1662, f604, f845;
mul.f32 f852, f610, f845;
mul.f32 f1661, f550, f611;
fma.rn.f32 f853, f611, f843, f852;
mul.f32 f854, f609, f853;
mul.f32 f855, f608, f853;
mul.f32 f856, f851, f609;
barrier.sync 0;
and.b32 r11, r7, 7936;
add.s32 r12, r9, r11;
mov.u32 r17, %tid.x;
and.b32 r16, r17, 31;
sub.f32 f1871, f1823, f1773;
mul.f32 f1870, f731, f1871;
add.f32 f857, f1823, f1773;
mov.u32 r19, %tid.x;
and.b32 r18, r19, 31;
sub.f32 f1872, f272, f447;
add.f32 f858, f272, f447;
mov.u32 r21, %tid.x;
and.b32 r20, r21, 31;
mov.u32 r23, %tid.x;
and.b32 r22, r23, 31;
fma.rn.f32 f859, f610, f550, f614;
sub.f32 f860, f616, f1661;
st.shared.v4.f32 [r12], {f858, f857, f859, f860};
fma.rn.f32 f861, f619, f554, f622;
sub.f32 f862, f624, f1731;
fma.rn.f32 f863, f627, f558, f630;
sub.f32 f864, f632, f1728;
st.shared.v4.f32 [r12+16], {f861, f862, f863, f864};
fma.rn.f32 f865, f635, f562, f638;
sub.f32 f866, f640, f1726;
sub.f32 f867, f648, f1723;
fma.rn.f32 f868, f643, f566, f646;
st.shared.v4.f32 [r12+32], {f865, f866, f868, f867};
fma.rn.f32 f869, f651, f570, f654;
sub.f32 f870, f656, f1721;
fma.rn.f32 f871, f659, f574, f662;
sub.f32 f872, f664, f1719;
st.shared.v4.f32 [r12+48], {f869, f870, f871, f872};
fma.rn.f32 f873, f667, f578, f670;
sub.f32 f874, f672, f1716;
fma.rn.f32 f875, f675, f582, f678;
sub.f32 f876, f680, f1714;
st.shared.v4.f32 [r12+64], {f873, f874, f875, f876};
fma.rn.f32 f877, f683, f586, f686;
sub.f32 f878, f688, f1712;
fma.rn.f32 f879, f691, f590, f694;
sub.f32 f880, f696, f1709;
st.shared.v4.f32 [r12+80], {f877, f878, f879, f880};
fma.rn.f32 f881, f699, f594, f702;
sub.f32 f882, f704, f1707;
fma.rn.f32 f883, f707, f598, f710;
sub.f32 f884, f712, f1704;
st.shared.v4.f32 [r12+96], {f881, f882, f883, f884};
fma.rn.f32 f885, f715, f602, f718;
sub.f32 f886, f720, f1702;
fma.rn.f32 f887, f723, f606, f726;
sub.f32 f888, f728, f1700;
st.shared.v4.f32 [r12+112], {f885, f886, f887, f888};
fma.rn.f32 f889, f731, f1872, f734;
sub.f32 f890, f1870, f1695;
fma.rn.f32 f891, f739, f552, f742;
sub.f32 f892, f744, f1693;
st.shared.v4.f32 [r12+128], {f889, f890, f891, f892};
fma.rn.f32 f893, f747, f556, f750;
sub.f32 f894, f752, f1691;
fma.rn.f32 f895, f755, f560, f758;
sub.f32 f896, f760, f1688;
st.shared.v4.f32 [r12+144], {f893, f894, f895, f896};
fma.rn.f32 f897, f763, f564, f766;
sub.f32 f898, f768, f1686;
fma.rn.f32 f899, f771, f568, f774;
sub.f32 f900, f776, f1683;
st.shared.v4.f32 [r12+160], {f897, f898, f899, f900};
fma.rn.f32 f901, f779, f572, f782;
sub.f32 f902, f784, f1681;
fma.rn.f32 f903, f787, f576, f790;
sub.f32 f904, f792, f1679;
st.shared.v4.f32 [r12+176], {f901, f902, f903, f904};
fma.rn.f32 f905, f795, f580, f798;
sub.f32 f906, f800, f1676;
fma.rn.f32 f907, f803, f584, f806;
sub.f32 f908, f808, f1674;
st.shared.v4.f32 [r12+192], {f905, f906, f907, f908};
fma.rn.f32 f909, f811, f588, f814;
sub.f32 f910, f816, f1672;
fma.rn.f32 f911, f819, f592, f822;
sub.f32 f912, f824, f1669;
st.shared.v4.f32 [r12+208], {f909, f910, f911, f912};
fma.rn.f32 f913, f827, f596, f830;
sub.f32 f914, f832, f1667;
fma.rn.f32 f915, f835, f600, f838;
sub.f32 f916, f840, f1664;
st.shared.v4.f32 [r12+224], {f913, f914, f915, f916};
fma.rn.f32 f917, f843, f604, f846;
sub.f32 f918, f848, f1662;
fma.rn.f32 f919, f851, f608, f854;
sub.f32 f920, f856, f855;
st.shared.v4.f32 [r12+240], {f917, f918, f919, f920};
barrier.sync 0;
mad.lo.s32 r13, r22, -248, r12;
ld.shared.v2.f32 {f921, f922}, [r13];
ld.shared.v2.f32 {f925, f926}, [r13+256];
ld.shared.v2.f32 {f929, f930}, [r13+512];
ld.shared.v2.f32 {f933, f934}, [r13+768];
ld.shared.v2.f32 {f937, f938}, [r13+1024];
ld.shared.v2.f32 {f941, f942}, [r13+1280];
ld.shared.v2.f32 {f945, f946}, [r13+1536];
ld.shared.v2.f32 {f949, f950}, [r13+1792];
ld.shared.v2.f32 {f953, f954}, [r13+2048];
ld.shared.v2.f32 {f957, f958}, [r13+2304];
ld.shared.v2.f32 {f961, f962}, [r13+2560];
ld.shared.v2.f32 {f965, f966}, [r13+2816];
ld.shared.v2.f32 {f969, f970}, [r13+3072];
ld.shared.v2.f32 {f973, f974}, [r13+3328];
ld.shared.v2.f32 {f977, f978}, [r13+3584];
ld.shared.v2.f32 {f981, f982}, [r13+3840];
ld.shared.v2.f32 {f985, f986}, [r13+4096];
ld.shared.v2.f32 {f989, f990}, [r13+4352];
ld.shared.v2.f32 {f993, f994}, [r13+4608];
ld.shared.v2.f32 {f997, f998}, [r13+4864];
ld.shared.v2.f32 {f1001, f1002}, [r13+5120];
ld.shared.v2.f32 {f1005, f1006}, [r13+5376];
ld.shared.v2.f32 {f1009, f1010}, [r13+5632];
ld.shared.v2.f32 {f1013, f1014}, [r13+5888];
ld.shared.v2.f32 {f1017, f1018}, [r13+6144];
ld.shared.v2.f32 {f1021, f1022}, [r13+6400];
ld.shared.v2.f32 {f1025, f1026}, [r13+6656];
ld.shared.v2.f32 {f1029, f1030}, [r13+6912];
ld.shared.v2.f32 {f1033, f1034}, [r13+7168];
ld.shared.v2.f32 {f1037, f1038}, [r13+7424];
ld.shared.v2.f32 {f1041, f1042}, [r13+7680];
ld.shared.v2.f32 {f1045, f1046}, [r13+7936];
add.f32 f1049, f921, f985;
sub.f32 f1051, f921, f985;
add.f32 f1660, f922, f986;
sub.f32 f1052, f922, f986;
add.f32 f1053, f953, f1017;
sub.f32 f1055, f953, f1017;
add.f32 f1659, f954, f1018;
sub.f32 f1056, f954, f1018;
add.f32 f1057, f1049, f1053;
sub.f32 f1059, f1049, f1053;
add.f32 f1658, f1660, f1659;
sub.f32 f1060, f1660, f1659;
sub.f32 f1061, f1051, f1056;
add.f32 f1063, f1051, f1056;
add.f32 f1657, f1052, f1055;
sub.f32 f1064, f1052, f1055;
add.f32 f1065, f937, f1001;
sub.f32 f1067, f937, f1001;
add.f32 f1656, f938, f1002;
sub.f32 f1068, f938, f1002;
add.f32 f1069, f969, f1033;
sub.f32 f1071, f969, f1033;
add.f32 f1655, f970, f1034;
sub.f32 f1072, f970, f1034;
add.f32 f1073, f1065, f1069;
sub.f32 f1075, f1065, f1069;
add.f32 f1654, f1656, f1655;
sub.f32 f1076, f1656, f1655;
sub.f32 f1077, f1067, f1072;
add.f32 f1079, f1067, f1072;
add.f32 f1653, f1068, f1071;
sub.f32 f1080, f1068, f1071;
mul.f32 f1081, f1077, 0f3F3504F3;
mul.f32 f1082, f1653, 0f3F3504F3;
sub.f32 f1083, f1081, f1082;
add.f32 f1084, f1081, f1082;
mul.f32 f1651, f1079, 0fBF3504F3;
mul.f32 f1652, f1080, 0f3F3504F3;
sub.f32 f1087, f1651, f1652;
mul.f32 f1088, f1080, 0fBF3504F3;
fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088;
add.f32 f1090, f1057, f1073;
sub.f32 f1092, f1057, f1073;
add.f32 f1650, f1658, f1654;
sub.f32 f1093, f1658, f1654;
add.f32 f1094, f1061, f1083;
sub.f32 f1096, f1061, f1083;
add.f32 f1649, f1657, f1084;
sub.f32 f1097, f1657, f1084;
sub.f32 f1098, f1059, f1076;
add.f32 f1100, f1059, f1076;
add.f32 f1648, f1060, f1075;
sub.f32 f1101, f1060, f1075;
add.f32 f1102, f1063, f1087;
sub.f32 f1104, f1063, f1087;
add.f32 f1647, f1064, f1089;
sub.f32 f1105, f1064, f1089;
add.f32 f1106, f929, f993;
sub.f32 f1108, f929, f993;
add.f32 f1646, f930, f994;
sub.f32 f1109, f930, f994;
add.f32 f1110, f961, f1025;
sub.f32 f1112, f961, f1025;
add.f32 f1645, f962, f1026;
sub.f32 f1113, f962, f1026;
add.f32 f1114, f1106, f1110;
sub.f32 f1116, f1106, f1110;
add.f32 f1644, f1646, f1645;
sub.f32 f1117, f1646, f1645;
sub.f32 f1118, f1108, f1113;
add.f32 f1120, f1108, f1113;
add.f32 f1643, f1109, f1112;
sub.f32 f1121, f1109, f1112;
add.f32 f1122, f945, f1009;
sub.f32 f1124, f945, f1009;
add.f32 f1642, f946, f1010;
sub.f32 f1125, f946, f1010;
add.f32 f1126, f977, f1041;
sub.f32 f1128, f977, f1041;
add.f32 f1641, f978, f1042;
sub.f32 f1129, f978, f1042;
add.f32 f1130, f1122, f1126;
sub.f32 f1132, f1122, f1126;
add.f32 f1640, f1642, f1641;
sub.f32 f1133, f1642, f1641;
sub.f32 f1134, f1124, f1129;
add.f32 f1136, f1124, f1129;
add.f32 f1639, f1125, f1128;
sub.f32 f1137, f1125, f1128;
mul.f32 f1138, f1134, 0f3F3504F3;
mul.f32 f1139, f1639, 0f3F3504F3;
sub.f32 f1140, f1138, f1139;
add.f32 f1141, f1138, f1139;
mul.f32 f1637, f1136, 0fBF3504F3;
mul.f32 f1638, f1137, 0f3F3504F3;
sub.f32 f1144, f1637, f1638;
mul.f32 f1145, f1137, 0fBF3504F3;
fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145;
add.f32 f1147, f1114, f1130;
sub.f32 f1149, f1114, f1130;
add.f32 f1636, f1644, f1640;
sub.f32 f1150, f1644, f1640;
add.f32 f1151, f1118, f1140;
sub.f32 f1153, f1118, f1140;
add.f32 f1635, f1643, f1141;
sub.f32 f1154, f1643, f1141;
sub.f32 f1155, f1116, f1133;
add.f32 f1157, f1116, f1133;
add.f32 f1634, f1117, f1132;
sub.f32 f1158, f1117, f1132;
add.f32 f1159, f1120, f1144;
sub.f32 f1161, f1120, f1144;
add.f32 f1633, f1121, f1146;
sub.f32 f1162, f1121, f1146;
mul.f32 f1631, f1151, 0f3F6C835E;
mul.f32 f1632, f1635, 0f3EC3EF15;
sub.f32 f1165, f1631, f1632;
mul.f32 f1166, f1635, 0f3F6C835E;
fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166;
mul.f32 f1168, f1155, 0f3F3504F3;
mul.f32 f1169, f1634, 0f3F3504F3;
sub.f32 f1170, f1168, f1169;
add.f32 f1171, f1168, f1169;
mul.f32 f1629, f1159, 0f3EC3EF15;
mul.f32 f1630, f1633, 0f3F6C835E;
sub.f32 f1174, f1629, f1630;
mul.f32 f1175, f1633, 0f3EC3EF15;
fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175;
mul.f32 f1627, f1153, 0fBEC3EF15;
mul.f32 f1628, f1154, 0f3F6C835E;
sub.f32 f1179, f1627, f1628;
mul.f32 f1180, f1154, 0fBEC3EF15;
fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180;
mul.f32 f1625, f1157, 0fBF3504F3;
mul.f32 f1626, f1158, 0f3F3504F3;
sub.f32 f1184, f1625, f1626;
mul.f32 f1185, f1158, 0fBF3504F3;
fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185;
mul.f32 f1623, f1161, 0fBF6C835E;
mul.f32 f1624, f1162, 0f3EC3EF15;
sub.f32 f1189, f1623, f1624;
mul.f32 f1190, f1162, 0fBF6C835E;
fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190;
add.f32 f1192, f1090, f1147;
sub.f32 f1194, f1090, f1147;
add.f32 f1622, f1650, f1636;
sub.f32 f1195, f1650, f1636;
add.f32 f1196, f1094, f1165;
sub.f32 f1198, f1094, f1165;
add.f32 f1621, f1649, f1167;
sub.f32 f1199, f1649, f1167;
add.f32 f1200, f1098, f1170;
sub.f32 f1202, f1098, f1170;
add.f32 f1620, f1648, f1171;
sub.f32 f1203, f1648, f1171;
add.f32 f1204, f1102, f1174;
sub.f32 f1206, f1102, f1174;
add.f32 f1619, f1647, f1176;
sub.f32 f1207, f1647, f1176;
sub.f32 f1208, f1092, f1150;
add.f32 f1210, f1092, f1150;
add.f32 f1618, f1093, f1149;
sub.f32 f1211, f1093, f1149;
add.f32 f1212, f1096, f1179;
sub.f32 f1214, f1096, f1179;
add.f32 f1617, f1097, f1181;
sub.f32 f1215, f1097, f1181;
add.f32 f1216, f1100, f1184;
sub.f32 f1218, f1100, f1184;
add.f32 f1616, f1101, f1186;
sub.f32 f1219, f1101, f1186;
add.f32 f1220, f1104, f1189;
sub.f32 f1222, f1104, f1189;
add.f32 f1615, f1105, f1191;
sub.f32 f1223, f1105, f1191;
add.f32 f1224, f925, f989;
sub.f32 f1226, f925, f989;
add.f32 f1614, f926, f990;
sub.f32 f1227, f926, f990;
add.f32 f1228, f957, f1021;
sub.f32 f1230, f957, f1021;
add.f32 f1613, f958, f1022;
sub.f32 f1231, f958, f1022;
add.f32 f1232, f1224, f1228;
sub.f32 f1234, f1224, f1228;
add.f32 f1612, f1614, f1613;
sub.f32 f1235, f1614, f1613;
sub.f32 f1236, f1226, f1231;
add.f32 f1238, f1226, f1231;
add.f32 f1611, f1227, f1230;
sub.f32 f1239, f1227, f1230;
add.f32 f1240, f941, f1005;
sub.f32 f1242, f941, f1005;
add.f32 f1610, f942, f1006;
sub.f32 f1243, f942, f1006;
add.f32 f1244, f973, f1037;
sub.f32 f1246, f973, f1037;
add.f32 f1609, f974, f1038;
sub.f32 f1247, f974, f1038;
add.f32 f1248, f1240, f1244;
sub.f32 f1250, f1240, f1244;
add.f32 f1608, f1610, f1609;
sub.f32 f1251, f1610, f1609;
sub.f32 f1252, f1242, f1247;
add.f32 f1254, f1242, f1247;
add.f32 f1607, f1243, f1246;
sub.f32 f1255, f1243, f1246;
mul.f32 f1256, f1252, 0f3F3504F3;
mul.f32 f1257, f1607, 0f3F3504F3;
sub.f32 f1258, f1256, f1257;
add.f32 f1259, f1256, f1257;
mul.f32 f1605, f1254, 0fBF3504F3;
mul.f32 f1606, f1255, 0f3F3504F3;
sub.f32 f1262, f1605, f1606;
mul.f32 f1263, f1255, 0fBF3504F3;
fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263;
add.f32 f1265, f1232, f1248;
sub.f32 f1267, f1232, f1248;
add.f32 f1604, f1612, f1608;
sub.f32 f1268, f1612, f1608;
add.f32 f1269, f1236, f1258;
sub.f32 f1271, f1236, f1258;
add.f32 f1603, f1611, f1259;
sub.f32 f1272, f1611, f1259;
sub.f32 f1273, f1234, f1251;
add.f32 f1275, f1234, f1251;
add.f32 f1602, f1235, f1250;
sub.f32 f1276, f1235, f1250;
add.f32 f1277, f1238, f1262;
sub.f32 f1279, f1238, f1262;
add.f32 f1601, f1239, f1264;
sub.f32 f1280, f1239, f1264;
add.f32 f1281, f933, f997;
sub.f32 f1283, f933, f997;
add.f32 f1600, f934, f998;
sub.f32 f1284, f934, f998;
add.f32 f1285, f965, f1029;
sub.f32 f1287, f965, f1029;
add.f32 f1599, f966, f1030;
sub.f32 f1288, f966, f1030;
add.f32 f1289, f1281, f1285;
sub.f32 f1291, f1281, f1285;
add.f32 f1598, f1600, f1599;
sub.f32 f1292, f1600, f1599;
sub.f32 f1293, f1283, f1288;
add.f32 f1295, f1283, f1288;
add.f32 f1597, f1284, f1287;
sub.f32 f1296, f1284, f1287;
add.f32 f1297, f949, f1013;
sub.f32 f1299, f949, f1013;
add.f32 f1596, f950, f1014;
sub.f32 f1300, f950, f1014;
add.f32 f1301, f981, f1045;
sub.f32 f1303, f981, f1045;
add.f32 f1595, f982, f1046;
sub.f32 f1304, f982, f1046;
add.f32 f1305, f1297, f1301;
sub.f32 f1307, f1297, f1301;
add.f32 f1594, f1596, f1595;
sub.f32 f1308, f1596, f1595;
sub.f32 f1309, f1299, f1304;
add.f32 f1311, f1299, f1304;
add.f32 f1593, f1300, f1303;
sub.f32 f1312, f1300, f1303;
mul.f32 f1313, f1309, 0f3F3504F3;
mul.f32 f1314, f1593, 0f3F3504F3;
sub.f32 f1315, f1313, f1314;
add.f32 f1316, f1313, f1314;
mul.f32 f1591, f1311, 0fBF3504F3;
mul.f32 f1592, f1312, 0f3F3504F3;
sub.f32 f1319, f1591, f1592;
mul.f32 f1320, f1312, 0fBF3504F3;
fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320;
add.f32 f1322, f1289, f1305;
sub.f32 f1324, f1289, f1305;
add.f32 f1590, f1598, f1594;
sub.f32 f1325, f1598, f1594;
add.f32 f1326, f1293, f1315;
sub.f32 f1328, f1293, f1315;
add.f32 f1589, f1597, f1316;
sub.f32 f1329, f1597, f1316;
sub.f32 f1330, f1291, f1308;
add.f32 f1332, f1291, f1308;
add.f32 f1588, f1292, f1307;
sub.f32 f1333, f1292, f1307;
add.f32 f1334, f1295, f1319;
sub.f32 f1336, f1295, f1319;
add.f32 f1587, f1296, f1321;
sub.f32 f1337, f1296, f1321;
mul.f32 f1585, f1326, 0f3F6C835E;
mul.f32 f1586, f1589, 0f3EC3EF15;
sub.f32 f1340, f1585, f1586;
mul.f32 f1341, f1589, 0f3F6C835E;
fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341;
mul.f32 f1343, f1330, 0f3F3504F3;
mul.f32 f1344, f1588, 0f3F3504F3;
sub.f32 f1345, f1343, f1344;
add.f32 f1346, f1343, f1344;
mul.f32 f1348, f1587, 0f3F6C835E;
mul.f32 f1584, f1334, 0f3EC3EF15;
sub.f32 f1349, f1584, f1348;
mul.f32 f1350, f1587, 0f3EC3EF15;
fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350;
mul.f32 f1353, f1329, 0f3F6C835E;
mul.f32 f1583, f1328, 0fBEC3EF15;
sub.f32 f1354, f1583, f1353;
mul.f32 f1355, f1329, 0fBEC3EF15;
fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355;
mul.f32 f1581, f1332, 0fBF3504F3;
mul.f32 f1582, f1333, 0f3F3504F3;
sub.f32 f1359, f1581, f1582;
mul.f32 f1360, f1333, 0fBF3504F3;
fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360;
mul.f32 f1579, f1336, 0fBF6C835E;
mul.f32 f1580, f1337, 0f3EC3EF15;
sub.f32 f1364, f1579, f1580;
mul.f32 f1365, f1337, 0fBF6C835E;
fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365;
add.f32 f1367, f1265, f1322;
sub.f32 f1369, f1265, f1322;
add.f32 f1578, f1604, f1590;
sub.f32 f1370, f1604, f1590;
add.f32 f1371, f1269, f1340;
sub.f32 f1373, f1269, f1340;
add.f32 f1577, f1603, f1342;
sub.f32 f1374, f1603, f1342;
add.f32 f1375, f1273, f1345;
sub.f32 f1377, f1273, f1345;
add.f32 f1576, f1602, f1346;
sub.f32 f1378, f1602, f1346;
add.f32 f1379, f1277, f1349;
sub.f32 f1381, f1277, f1349;
add.f32 f1575, f1601, f1351;
sub.f32 f1382, f1601, f1351;
sub.f32 f1383, f1267, f1325;
add.f32 f1385, f1267, f1325;
add.f32 f1574, f1268, f1324;
sub.f32 f1386, f1268, f1324;
add.f32 f1387, f1271, f1354;
sub.f32 f1389, f1271, f1354;
add.f32 f1573, f1272, f1356;
sub.f32 f1390, f1272, f1356;
add.f32 f1391, f1275, f1359;
sub.f32 f1393, f1275, f1359;
add.f32 f1572, f1276, f1361;
sub.f32 f1394, f1276, f1361;
add.f32 f1395, f1279, f1364;
sub.f32 f1397, f1279, f1364;
add.f32 f1571, f1280, f1366;
sub.f32 f1398, f1280, f1366;
mul.f32 f1400, f1577, 0f3E47C5C2;
mul.f32 f1570, f1371, 0f3F7B14BE;
sub.f32 f1401, f1570, f1400;
mul.f32 f1402, f1577, 0f3F7B14BE;
fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402;
mul.f32 f1405, f1576, 0f3EC3EF15;
mul.f32 f1569, f1375, 0f3F6C835E;
sub.f32 f1406, f1569, f1405;
mul.f32 f1407, f1576, 0f3F6C835E;
fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407;
mul.f32 f1567, f1379, 0f3F54DB31;
mul.f32 f1568, f1575, 0f3F0E39DA;
sub.f32 f1411, f1567, f1568;
mul.f32 f1412, f1575, 0f3F54DB31;
fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412;
mul.f32 f1414, f1383, 0f3F3504F3;
mul.f32 f1415, f1574, 0f3F3504F3;
sub.f32 f1416, f1414, f1415;
add.f32 f1417, f1414, f1415;
mul.f32 f1419, f1573, 0f3F54DB31;
mul.f32 f1566, f1387, 0f3F0E39DA;
sub.f32 f1420, f1566, f1419;
mul.f32 f1421, f1573, 0f3F0E39DA;
fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421;
mul.f32 f1424, f1572, 0f3F6C835E;
mul.f32 f1565, f1391, 0f3EC3EF15;
sub.f32 f1425, f1565, f1424;
mul.f32 f1426, f1572, 0f3EC3EF15;
fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426;
mul.f32 f1563, f1395, 0f3E47C5C2;
mul.f32 f1564, f1571, 0f3F7B14BE;
sub.f32 f1430, f1563, f1564;
mul.f32 f1431, f1571, 0f3E47C5C2;
fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431;
mul.f32 f1561, f1373, 0fBE47C5C2;
mul.f32 f1562, f1374, 0f3F7B14BE;
sub.f32 f1435, f1561, f1562;
mul.f32 f1436, f1374, 0fBE47C5C2;
fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436;
mul.f32 f1559, f1377, 0fBEC3EF15;
mul.f32 f1560, f1378, 0f3F6C835E;
sub.f32 f1440, f1559, f1560;
mul.f32 f1441, f1378, 0fBEC3EF15;
fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441;
mul.f32 f1557, f1381, 0fBF0E39DA;
mul.f32 f1558, f1382, 0f3F54DB31;
sub.f32 f1445, f1557, f1558;
mul.f32 f1446, f1382, 0fBF0E39DA;
fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446;
mul.f32 f1449, f1386, 0f3F3504F3;
mul.f32 f1556, f1385, 0fBF3504F3;
sub.f32 f1450, f1556, f1449;
mul.f32 f1451, f1386, 0fBF3504F3;
fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451;
mul.f32 f1454, f1390, 0f3F0E39DA;
mul.f32 f1555, f1389, 0fBF54DB31;
sub.f32 f1455, f1555, f1454;
mul.f32 f1456, f1390, 0fBF54DB31;
fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456;
mul.f32 f1459, f1394, 0f3EC3EF15;
mul.f32 f1554, f1393, 0fBF6C835E;
sub.f32 f1460, f1554, f1459;
mul.f32 f1461, f1394, 0fBF6C835E;
fma.rn.f32 f1462, f1393, 0f3EC3EF15, f1461;
mul.f32 f1464, f1398, 0f3E47C5C2;
mul.f32 f1553, f1397, 0fBF7B14BE;
sub.f32 f1465, f1553, f1464;
mul.f32 f1466, f1398, 0fBF7B14BE;
fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466;
add.f32 %0, f1192, f1367;
add.f32 %1, f1622, f1578;
add.f32 %2, f1196, f1401;
add.f32 %3, f1621, f1403;
add.f32 %4, f1200, f1406;
add.f32 %5, f1620, f1408;
add.f32 %6, f1204, f1411;
add.f32 %7, f1619, f1413;
add.f32 %9, f1618, f1417;
add.f32 %8, f1208, f1416;
add.f32 %11, f1617, f1422;
add.f32 %10, f1212, f1420;
add.f32 %12, f1216, f1425;
add.f32 %13, f1616, f1427;
add.f32 %14, f1220, f1430;
add.f32 %15, f1615, f1432;
sub.f32 %16, f1194, f1370;
add.f32 %17, f1195, f1369;
add.f32 %18, f1198, f1435;
add.f32 %19, f1199, f1437;
add.f32 %21, f1203, f1442;
add.f32 %20, f1202, f1440;
add.f32 %23, f1207, f1447;
add.f32 %22, f1206, f1445;
add.f32 %25, f1211, f1452;
add.f32 %24, f1210, f1450;
add.f32 %26, f1214, f1455;
add.f32 %27, f1215, f1457;
add.f32 %28, f1218, f1460;
add.f32 %29, f1219, f1462;
add.f32 %30, f1222, f1465;
add.f32 %31, f1223, f1467;
sub.f32 %33, f1622, f1578;
sub.f32 %32, f1192, f1367;
sub.f32 %35, f1621, f1403;
sub.f32 %34, f1196, f1401;
sub.f32 %37, f1620, f1408;
sub.f32 %36, f1200, f1406;
sub.f32 %39, f1619, f1413;
sub.f32 %38, f1204, f1411;
sub.f32 %41, f1618, f1417;
sub.f32 %40, f1208, f1416;
sub.f32 %43, f1617, f1422;
sub.f32 %42, f1212, f1420;
sub.f32 %45, f1616, f1427;
sub.f32 %44, f1216, f1425;
sub.f32 %47, f1615, f1432;
sub.f32 %46, f1220, f1430;
sub.f32 %49, f1195, f1369;
add.f32 %48, f1194, f1370;
sub.f32 %51, f1199, f1437;
sub.f32 %50, f1198, f1435;
sub.f32 %53, f1203, f1442;
sub.f32 %52, f1202, f1440;
sub.f32 %55, f1207, f1447;
sub.f32 %54, f1206, f1445;
sub.f32 %57, f1211, f1452;
sub.f32 %56, f1210, f1450;
sub.f32 %59, f1215, f1457;
sub.f32 %58, f1214, f1455;
sub.f32 %61, f1219, f1462;
sub.f32 %60, f1218, f1460;
sub.f32 %63, f1223, f1467;
sub.f32 %62, f1222, f1465;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), 
"f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




// cufftdx_private_function<283, float, 1>
//
// Per-thread FFT step implemented as one inline-PTX block. The include guard of
// this header names it FFT_1024_FP32_INV, so this is presumably one of the
// inverse 1024-point single-precision kernels -- TODO confirm against the
// dispatcher that selects function id 283.
//
// Parameters:
//   rmem - per-thread array of complex<float>. Elements rmem[0]..rmem[15]
//          (.x and .y) are read as inputs and all 16 are overwritten with the
//          results.
//   smem - 32-bit shared-memory address used as the scratch base. The PTX adds
//          (tid.y << 12) and ((tid.x << 6) & -4096) to it, i.e. 4096 bytes of
//          scratch per tid.y slot and per group of 64 consecutive tid.x values.
//
// External symbols (declared elsewhere, read with ld.global.v2.f32):
//   lut_sp_16_1024 - one (x, y) float pair is loaded at entry index (tid.x & 63).
//   lut_sp_16_64   - one (x, y) float pair is loaded at entry index
//                    ((tid.x >> 4) & 3).
//
// Sequence of operations inside the PTX:
//   1. A 16-point add/sub butterfly network over the 16 register inputs, using
//      the constants 0f3F3504F3 (~0.70710678), 0f3F6C835E (~0.92387953) and
//      0f3EC3EF15 (~0.38268343).
//   2. Outputs 1..15 of that network are scaled by successive powers of the
//      pair t loaded from lut_sp_16_1024. The powers are built by repeated
//      complex multiplication by t; each scaling has the form
//        re' = t.x*re + t.y*im,  im' = t.x*im - t.y*re.
//   3. Shared-memory exchange: every thread stores its 16 real parts as four
//      v4 stores at byte offset (tid.x & 63) * 64, then reloads 16 floats at
//      byte offset (tid.x & 63) * 4 with a 256-byte stride. The same is then
//      repeated for the imaginary parts.
//   4. A second 16-point butterfly network, followed by the same style of
//      scaling with the pair loaded from lut_sp_16_64.
//   5. Second shared-memory exchange (scalar stores with a 64-byte stride,
//      loads with a 256-byte stride), again real parts first, then imaginary.
//   6. Four 4-point butterflies over elements {k, k+4, k+8, k+12}, written
//      straight to the output operands.
//
// Synchronization: the block executes `barrier.sync 0` eight times (before and
// after each of the four store phases).
// NOTE(review): barrier 0 without a thread count is CTA-wide, so presumably
// every thread of the block has to call this function together -- confirm
// against callers.
template<> __forceinline__ __device__ void cufftdx_private_function<283, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<841>;
.reg .b32 r<22>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f65, %35, %56;
add.f32 f66, %36, %58;
sub.f32 f67, %35, %56;
sub.f32 f68, %36, %58;
add.f32 f69, %45, %67;
add.f32 f70, %47, %68;
sub.f32 f71, %45, %67;
sub.f32 f72, %47, %68;
add.f32 f73, f65, f69;
add.f32 f74, f66, f70;
sub.f32 f75, f65, f69;
sub.f32 f76, f66, f70;
sub.f32 f77, f67, f72;
add.f32 f78, f68, f71;
add.f32 f79, f67, f72;
sub.f32 f80, f68, f71;
add.f32 f81, %40, %61;
add.f32 f82, %42, %63;
sub.f32 f83, %40, %61;
sub.f32 f84, %42, %63;
add.f32 f85, %51, %72;
add.f32 f86, %52, %74;
sub.f32 f87, %51, %72;
sub.f32 f88, %52, %74;
add.f32 f89, f81, f85;
add.f32 f90, f82, f86;
sub.f32 f91, f81, f85;
sub.f32 f92, f82, f86;
sub.f32 f93, f83, f88;
add.f32 f94, f84, f87;
add.f32 f95, f83, f88;
sub.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f94, 0f3F3504F3;
sub.f32 f99, f97, f98;
add.f32 f100, f97, f98;
mul.f32 f101, f95, 0fBF3504F3;
mul.f32 f102, f96, 0f3F3504F3;
sub.f32 f103, f101, f102;
mul.f32 f104, f96, 0fBF3504F3;
fma.rn.f32 f105, f95, 0f3F3504F3, f104;
add.f32 f106, f73, f89;
add.f32 f107, f74, f90;
sub.f32 f108, f73, f89;
sub.f32 f109, f74, f90;
add.f32 f110, f77, f99;
add.f32 f111, f78, f100;
sub.f32 f112, f77, f99;
sub.f32 f113, f78, f100;
sub.f32 f114, f75, f92;
add.f32 f115, f76, f91;
add.f32 f116, f75, f92;
sub.f32 f117, f76, f91;
add.f32 f118, f79, f103;
add.f32 f119, f80, f105;
sub.f32 f120, f79, f103;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %59;
add.f32 f123, %39, %60;
sub.f32 f124, %37, %59;
sub.f32 f125, %39, %60;
add.f32 f126, %48, %69;
add.f32 f127, %50, %71;
sub.f32 f128, %48, %69;
sub.f32 f129, %50, %71;
add.f32 f130, f122, f126;
add.f32 f131, f123, f127;
sub.f32 f132, f122, f126;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f129;
add.f32 f135, f125, f128;
add.f32 f136, f124, f129;
sub.f32 f137, f125, f128;
add.f32 f138, %43, %64;
add.f32 f139, %44, %66;
sub.f32 f140, %43, %64;
sub.f32 f141, %44, %66;
add.f32 f142, %53, %75;
add.f32 f143, %55, %76;
sub.f32 f144, %53, %75;
sub.f32 f145, %55, %76;
add.f32 f146, f138, f142;
add.f32 f147, f139, f143;
sub.f32 f148, f138, f142;
sub.f32 f149, f139, f143;
sub.f32 f150, f140, f145;
add.f32 f151, f141, f144;
add.f32 f152, f140, f145;
sub.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f151, 0f3F3504F3;
sub.f32 f156, f154, f155;
add.f32 f157, f154, f155;
mul.f32 f158, f152, 0fBF3504F3;
mul.f32 f159, f153, 0f3F3504F3;
sub.f32 f160, f158, f159;
mul.f32 f161, f153, 0fBF3504F3;
fma.rn.f32 f162, f152, 0f3F3504F3, f161;
add.f32 f163, f130, f146;
add.f32 f164, f131, f147;
sub.f32 f165, f130, f146;
sub.f32 f166, f131, f147;
add.f32 f167, f134, f156;
add.f32 f168, f135, f157;
sub.f32 f169, f134, f156;
sub.f32 f170, f135, f157;
sub.f32 f171, f132, f149;
add.f32 f172, f133, f148;
add.f32 f173, f132, f149;
sub.f32 f174, f133, f148;
add.f32 f175, f136, f160;
add.f32 f176, f137, f162;
sub.f32 f177, f136, f160;
sub.f32 f178, f137, f162;
mul.f32 f179, f167, 0f3F6C835E;
mul.f32 f180, f168, 0f3EC3EF15;
sub.f32 f181, f179, f180;
mul.f32 f182, f168, 0f3F6C835E;
fma.rn.f32 f183, f167, 0f3EC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f172, 0f3F3504F3;
sub.f32 f186, f184, f185;
add.f32 f187, f184, f185;
mul.f32 f188, f175, 0f3EC3EF15;
mul.f32 f189, f176, 0f3F6C835E;
sub.f32 f190, f188, f189;
mul.f32 f191, f176, 0f3EC3EF15;
fma.rn.f32 f192, f175, 0f3F6C835E, f191;
mul.f32 f193, f169, 0fBEC3EF15;
mul.f32 f194, f170, 0f3F6C835E;
sub.f32 f195, f193, f194;
mul.f32 f196, f170, 0fBEC3EF15;
fma.rn.f32 f197, f169, 0f3F6C835E, f196;
mul.f32 f198, f173, 0fBF3504F3;
mul.f32 f199, f174, 0f3F3504F3;
sub.f32 f200, f198, f199;
mul.f32 f201, f174, 0fBF3504F3;
fma.rn.f32 f202, f173, 0f3F3504F3, f201;
mul.f32 f203, f177, 0fBF6C835E;
mul.f32 f204, f178, 0f3EC3EF15;
sub.f32 f205, f203, f204;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0f3EC3EF15, f206;
add.f32 f208, f106, f163;
add.f32 f209, f107, f164;
sub.f32 f210, f106, f163;
sub.f32 f211, f107, f164;
add.f32 f212, f110, f181;
add.f32 f213, f111, f183;
sub.f32 f214, f110, f181;
sub.f32 f215, f111, f183;
add.f32 f216, f114, f186;
add.f32 f217, f115, f187;
sub.f32 f218, f114, f186;
sub.f32 f219, f115, f187;
add.f32 f220, f118, f190;
add.f32 f221, f119, f192;
sub.f32 f222, f118, f190;
sub.f32 f223, f119, f192;
sub.f32 f224, f108, f166;
add.f32 f225, f109, f165;
add.f32 f226, f108, f166;
sub.f32 f227, f109, f165;
add.f32 f228, f112, f195;
add.f32 f229, f113, f197;
sub.f32 f230, f112, f195;
sub.f32 f231, f113, f197;
add.f32 f232, f116, f200;
add.f32 f233, f117, f202;
sub.f32 f234, f116, f200;
sub.f32 f235, f117, f202;
add.f32 f236, f120, f205;
add.f32 f237, f121, f207;
sub.f32 f238, f120, f205;
sub.f32 f239, f121, f207;
and.b32 r6, r5, 63;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f240, f241}, [rd5];
mul.f32 f244, f213, f241;
fma.rn.f32 f245, f240, f212, f244;
mul.f32 f246, f212, f241;
mul.f32 f247, f240, f213;
sub.f32 f248, f247, f246;
mul.f32 f249, f240, f240;
mul.f32 f250, f241, f241;
sub.f32 f251, f249, f250;
mul.f32 f252, f241, f240;
fma.rn.f32 f253, f241, f240, f252;
mul.f32 f254, f217, f253;
fma.rn.f32 f255, f251, f216, f254;
mul.f32 f256, f216, f253;
mul.f32 f257, f251, f217;
sub.f32 f258, f257, f256;
mul.f32 f259, f240, f251;
mul.f32 f260, f241, f253;
sub.f32 f261, f259, f260;
mul.f32 f262, f240, f253;
fma.rn.f32 f263, f241, f251, f262;
mul.f32 f264, f221, f263;
fma.rn.f32 f265, f261, f220, f264;
mul.f32 f266, f220, f263;
mul.f32 f267, f261, f221;
sub.f32 f268, f267, f266;
mul.f32 f269, f240, f261;
mul.f32 f270, f241, f263;
sub.f32 f271, f269, f270;
mul.f32 f272, f240, f263;
fma.rn.f32 f273, f241, f261, f272;
mul.f32 f274, f225, f273;
fma.rn.f32 f275, f271, f224, f274;
mul.f32 f276, f224, f273;
mul.f32 f277, f271, f225;
sub.f32 f278, f277, f276;
mul.f32 f279, f240, f271;
mul.f32 f280, f241, f273;
sub.f32 f281, f279, f280;
mul.f32 f282, f240, f273;
fma.rn.f32 f283, f241, f271, f282;
mul.f32 f284, f229, f283;
fma.rn.f32 f285, f281, f228, f284;
mul.f32 f286, f228, f283;
mul.f32 f287, f281, f229;
sub.f32 f288, f287, f286;
mul.f32 f289, f240, f281;
mul.f32 f290, f241, f283;
sub.f32 f291, f289, f290;
mul.f32 f292, f240, f283;
fma.rn.f32 f293, f241, f281, f292;
mul.f32 f294, f233, f293;
fma.rn.f32 f295, f291, f232, f294;
mul.f32 f296, f232, f293;
mul.f32 f297, f291, f233;
sub.f32 f298, f297, f296;
mul.f32 f299, f240, f291;
mul.f32 f300, f241, f293;
sub.f32 f301, f299, f300;
mul.f32 f302, f240, f293;
fma.rn.f32 f303, f241, f291, f302;
mul.f32 f304, f237, f303;
fma.rn.f32 f305, f301, f236, f304;
mul.f32 f306, f236, f303;
mul.f32 f307, f301, f237;
sub.f32 f308, f307, f306;
mul.f32 f309, f240, f301;
mul.f32 f310, f241, f303;
sub.f32 f311, f309, f310;
mul.f32 f312, f240, f303;
fma.rn.f32 f313, f241, f301, f312;
mul.f32 f314, f211, f313;
fma.rn.f32 f315, f311, f210, f314;
mul.f32 f316, f210, f313;
mul.f32 f317, f311, f211;
sub.f32 f318, f317, f316;
mul.f32 f319, f240, f311;
mul.f32 f320, f241, f313;
sub.f32 f321, f319, f320;
mul.f32 f322, f240, f313;
fma.rn.f32 f323, f241, f311, f322;
mul.f32 f324, f215, f323;
fma.rn.f32 f325, f321, f214, f324;
mul.f32 f326, f214, f323;
mul.f32 f327, f321, f215;
sub.f32 f328, f327, f326;
mul.f32 f329, f240, f321;
mul.f32 f330, f241, f323;
sub.f32 f331, f329, f330;
mul.f32 f332, f240, f323;
fma.rn.f32 f333, f241, f321, f332;
mul.f32 f334, f219, f333;
fma.rn.f32 f335, f331, f218, f334;
mul.f32 f336, f218, f333;
mul.f32 f337, f331, f219;
sub.f32 f338, f337, f336;
mul.f32 f339, f240, f331;
mul.f32 f340, f241, f333;
sub.f32 f341, f339, f340;
mul.f32 f342, f240, f333;
fma.rn.f32 f343, f241, f331, f342;
mul.f32 f344, f223, f343;
fma.rn.f32 f345, f341, f222, f344;
mul.f32 f346, f222, f343;
mul.f32 f347, f341, f223;
sub.f32 f348, f347, f346;
mul.f32 f349, f240, f341;
mul.f32 f350, f241, f343;
sub.f32 f351, f349, f350;
mul.f32 f352, f240, f343;
fma.rn.f32 f353, f241, f341, f352;
mul.f32 f354, f227, f353;
fma.rn.f32 f355, f351, f226, f354;
mul.f32 f356, f226, f353;
mul.f32 f357, f351, f227;
sub.f32 f358, f357, f356;
mul.f32 f359, f240, f351;
mul.f32 f360, f241, f353;
sub.f32 f361, f359, f360;
mul.f32 f362, f240, f353;
fma.rn.f32 f363, f241, f351, f362;
mul.f32 f364, f231, f363;
fma.rn.f32 f365, f361, f230, f364;
mul.f32 f366, f230, f363;
mul.f32 f367, f361, f231;
sub.f32 f368, f367, f366;
mul.f32 f369, f240, f361;
mul.f32 f370, f241, f363;
sub.f32 f371, f369, f370;
mul.f32 f372, f240, f363;
fma.rn.f32 f373, f241, f361, f372;
mul.f32 f374, f235, f373;
fma.rn.f32 f375, f371, f234, f374;
mul.f32 f376, f234, f373;
mul.f32 f377, f371, f235;
sub.f32 f378, f377, f376;
mul.f32 f379, f240, f371;
mul.f32 f380, f241, f373;
sub.f32 f381, f379, f380;
mul.f32 f382, f240, f373;
fma.rn.f32 f383, f241, f371, f382;
mul.f32 f384, f239, f383;
fma.rn.f32 f385, f381, f238, f384;
mul.f32 f386, f238, f383;
mul.f32 f387, f381, f239;
sub.f32 f388, f387, f386;
shl.b32 r8, r5, 6;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4032;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f208, f245, f255, f265};
st.shared.v4.f32 [r12+16], {f275, f285, f295, f305};
st.shared.v4.f32 [r12+32], {f315, f325, f335, f345};
st.shared.v4.f32 [r12+48], {f355, f365, f375, f385};
barrier.sync 0;
mad.lo.s32 r13, r6, -60, r12;
ld.shared.f32 f389, [r13];
ld.shared.f32 f390, [r13+256];
ld.shared.f32 f391, [r13+512];
ld.shared.f32 f392, [r13+768];
ld.shared.f32 f393, [r13+1024];
ld.shared.f32 f394, [r13+1280];
ld.shared.f32 f395, [r13+1536];
ld.shared.f32 f396, [r13+1792];
ld.shared.f32 f397, [r13+2048];
ld.shared.f32 f398, [r13+2304];
ld.shared.f32 f399, [r13+2560];
ld.shared.f32 f400, [r13+2816];
ld.shared.f32 f401, [r13+3072];
ld.shared.f32 f402, [r13+3328];
ld.shared.f32 f403, [r13+3584];
ld.shared.f32 f404, [r13+3840];
barrier.sync 0;
st.shared.v4.f32 [r12], {f209, f248, f258, f268};
st.shared.v4.f32 [r12+16], {f278, f288, f298, f308};
st.shared.v4.f32 [r12+32], {f318, f328, f338, f348};
st.shared.v4.f32 [r12+48], {f358, f368, f378, f388};
barrier.sync 0;
ld.shared.f32 f405, [r13];
ld.shared.f32 f406, [r13+256];
ld.shared.f32 f407, [r13+512];
ld.shared.f32 f408, [r13+768];
ld.shared.f32 f409, [r13+1024];
ld.shared.f32 f410, [r13+1280];
ld.shared.f32 f411, [r13+1536];
ld.shared.f32 f412, [r13+1792];
ld.shared.f32 f413, [r13+2048];
ld.shared.f32 f414, [r13+2304];
ld.shared.f32 f415, [r13+2560];
ld.shared.f32 f416, [r13+2816];
ld.shared.f32 f417, [r13+3072];
ld.shared.f32 f418, [r13+3328];
ld.shared.f32 f419, [r13+3584];
ld.shared.f32 f420, [r13+3840];
add.f32 f421, f389, f397;
add.f32 f422, f405, f413;
sub.f32 f423, f389, f397;
sub.f32 f424, f405, f413;
add.f32 f425, f393, f401;
add.f32 f426, f409, f417;
sub.f32 f427, f393, f401;
sub.f32 f428, f409, f417;
add.f32 f429, f421, f425;
add.f32 f430, f422, f426;
sub.f32 f431, f421, f425;
sub.f32 f432, f422, f426;
sub.f32 f433, f423, f428;
add.f32 f434, f424, f427;
add.f32 f435, f423, f428;
sub.f32 f436, f424, f427;
add.f32 f437, f391, f399;
add.f32 f438, f407, f415;
sub.f32 f439, f391, f399;
sub.f32 f440, f407, f415;
add.f32 f441, f395, f403;
add.f32 f442, f411, f419;
sub.f32 f443, f395, f403;
sub.f32 f444, f411, f419;
add.f32 f445, f437, f441;
add.f32 f446, f438, f442;
sub.f32 f447, f437, f441;
sub.f32 f448, f438, f442;
sub.f32 f449, f439, f444;
add.f32 f450, f440, f443;
add.f32 f451, f439, f444;
sub.f32 f452, f440, f443;
mul.f32 f453, f449, 0f3F3504F3;
mul.f32 f454, f450, 0f3F3504F3;
sub.f32 f455, f453, f454;
add.f32 f456, f453, f454;
mul.f32 f457, f451, 0fBF3504F3;
mul.f32 f458, f452, 0f3F3504F3;
sub.f32 f459, f457, f458;
mul.f32 f460, f452, 0fBF3504F3;
fma.rn.f32 f461, f451, 0f3F3504F3, f460;
add.f32 f462, f429, f445;
add.f32 f463, f430, f446;
sub.f32 f464, f429, f445;
sub.f32 f465, f430, f446;
add.f32 f466, f433, f455;
add.f32 f467, f434, f456;
sub.f32 f468, f433, f455;
sub.f32 f469, f434, f456;
sub.f32 f470, f431, f448;
add.f32 f471, f432, f447;
add.f32 f472, f431, f448;
sub.f32 f473, f432, f447;
add.f32 f474, f435, f459;
add.f32 f475, f436, f461;
sub.f32 f476, f435, f459;
sub.f32 f477, f436, f461;
add.f32 f478, f390, f398;
add.f32 f479, f406, f414;
sub.f32 f480, f390, f398;
sub.f32 f481, f406, f414;
add.f32 f482, f394, f402;
add.f32 f483, f410, f418;
sub.f32 f484, f394, f402;
sub.f32 f485, f410, f418;
add.f32 f486, f478, f482;
add.f32 f487, f479, f483;
sub.f32 f488, f478, f482;
sub.f32 f489, f479, f483;
sub.f32 f490, f480, f485;
add.f32 f491, f481, f484;
add.f32 f492, f480, f485;
sub.f32 f493, f481, f484;
add.f32 f494, f392, f400;
add.f32 f495, f408, f416;
sub.f32 f496, f392, f400;
sub.f32 f497, f408, f416;
add.f32 f498, f396, f404;
add.f32 f499, f412, f420;
sub.f32 f500, f396, f404;
sub.f32 f501, f412, f420;
add.f32 f502, f494, f498;
add.f32 f503, f495, f499;
sub.f32 f504, f494, f498;
sub.f32 f505, f495, f499;
sub.f32 f506, f496, f501;
add.f32 f507, f497, f500;
add.f32 f508, f496, f501;
sub.f32 f509, f497, f500;
mul.f32 f510, f506, 0f3F3504F3;
mul.f32 f511, f507, 0f3F3504F3;
sub.f32 f512, f510, f511;
add.f32 f513, f510, f511;
mul.f32 f514, f508, 0fBF3504F3;
mul.f32 f515, f509, 0f3F3504F3;
sub.f32 f516, f514, f515;
mul.f32 f517, f509, 0fBF3504F3;
fma.rn.f32 f518, f508, 0f3F3504F3, f517;
add.f32 f519, f486, f502;
add.f32 f520, f487, f503;
sub.f32 f521, f486, f502;
sub.f32 f522, f487, f503;
add.f32 f523, f490, f512;
add.f32 f524, f491, f513;
sub.f32 f525, f490, f512;
sub.f32 f526, f491, f513;
sub.f32 f527, f488, f505;
add.f32 f528, f489, f504;
add.f32 f529, f488, f505;
sub.f32 f530, f489, f504;
add.f32 f531, f492, f516;
add.f32 f532, f493, f518;
sub.f32 f533, f492, f516;
sub.f32 f534, f493, f518;
mul.f32 f535, f523, 0f3F6C835E;
mul.f32 f536, f524, 0f3EC3EF15;
sub.f32 f537, f535, f536;
mul.f32 f538, f524, 0f3F6C835E;
fma.rn.f32 f539, f523, 0f3EC3EF15, f538;
mul.f32 f540, f527, 0f3F3504F3;
mul.f32 f541, f528, 0f3F3504F3;
sub.f32 f542, f540, f541;
add.f32 f543, f540, f541;
mul.f32 f544, f531, 0f3EC3EF15;
mul.f32 f545, f532, 0f3F6C835E;
sub.f32 f546, f544, f545;
mul.f32 f547, f532, 0f3EC3EF15;
fma.rn.f32 f548, f531, 0f3F6C835E, f547;
mul.f32 f549, f525, 0fBEC3EF15;
mul.f32 f550, f526, 0f3F6C835E;
sub.f32 f551, f549, f550;
mul.f32 f552, f526, 0fBEC3EF15;
fma.rn.f32 f553, f525, 0f3F6C835E, f552;
mul.f32 f554, f529, 0fBF3504F3;
mul.f32 f555, f530, 0f3F3504F3;
sub.f32 f556, f554, f555;
mul.f32 f557, f530, 0fBF3504F3;
fma.rn.f32 f558, f529, 0f3F3504F3, f557;
mul.f32 f559, f533, 0fBF6C835E;
mul.f32 f560, f534, 0f3EC3EF15;
sub.f32 f561, f559, f560;
mul.f32 f562, f534, 0fBF6C835E;
fma.rn.f32 f563, f533, 0f3EC3EF15, f562;
add.f32 f564, f462, f519;
add.f32 f565, f463, f520;
sub.f32 f566, f462, f519;
sub.f32 f567, f463, f520;
add.f32 f568, f466, f537;
add.f32 f569, f467, f539;
sub.f32 f570, f466, f537;
sub.f32 f571, f467, f539;
add.f32 f572, f470, f542;
add.f32 f573, f471, f543;
sub.f32 f574, f470, f542;
sub.f32 f575, f471, f543;
add.f32 f576, f474, f546;
add.f32 f577, f475, f548;
sub.f32 f578, f474, f546;
sub.f32 f579, f475, f548;
sub.f32 f580, f464, f522;
add.f32 f581, f465, f521;
add.f32 f582, f464, f522;
sub.f32 f583, f465, f521;
add.f32 f584, f468, f551;
add.f32 f585, f469, f553;
sub.f32 f586, f468, f551;
sub.f32 f587, f469, f553;
add.f32 f588, f472, f556;
add.f32 f589, f473, f558;
sub.f32 f590, f472, f556;
sub.f32 f591, f473, f558;
add.f32 f592, f476, f561;
add.f32 f593, f477, f563;
sub.f32 f594, f476, f561;
sub.f32 f595, f477, f563;
and.b32 r14, r5, 48;
bfe.u32 r15, r5, 4, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f596, f597}, [rd8];
mul.f32 f600, f569, f597;
fma.rn.f32 f601, f596, f568, f600;
mul.f32 f602, f568, f597;
mul.f32 f603, f596, f569;
sub.f32 f604, f603, f602;
mul.f32 f605, f596, f596;
mul.f32 f606, f597, f597;
sub.f32 f607, f605, f606;
mul.f32 f608, f597, f596;
fma.rn.f32 f609, f597, f596, f608;
mul.f32 f610, f573, f609;
fma.rn.f32 f611, f607, f572, f610;
mul.f32 f612, f572, f609;
mul.f32 f613, f607, f573;
sub.f32 f614, f613, f612;
mul.f32 f615, f596, f607;
mul.f32 f616, f597, f609;
sub.f32 f617, f615, f616;
mul.f32 f618, f596, f609;
fma.rn.f32 f619, f597, f607, f618;
mul.f32 f620, f577, f619;
fma.rn.f32 f621, f617, f576, f620;
mul.f32 f622, f576, f619;
mul.f32 f623, f617, f577;
sub.f32 f624, f623, f622;
mul.f32 f625, f596, f617;
mul.f32 f626, f597, f619;
sub.f32 f627, f625, f626;
mul.f32 f628, f596, f619;
fma.rn.f32 f629, f597, f617, f628;
mul.f32 f630, f581, f629;
fma.rn.f32 f631, f627, f580, f630;
mul.f32 f632, f580, f629;
mul.f32 f633, f627, f581;
sub.f32 f634, f633, f632;
mul.f32 f635, f596, f627;
mul.f32 f636, f597, f629;
sub.f32 f637, f635, f636;
mul.f32 f638, f596, f629;
fma.rn.f32 f639, f597, f627, f638;
mul.f32 f640, f585, f639;
fma.rn.f32 f641, f637, f584, f640;
mul.f32 f642, f584, f639;
mul.f32 f643, f637, f585;
sub.f32 f644, f643, f642;
mul.f32 f645, f596, f637;
mul.f32 f646, f597, f639;
sub.f32 f647, f645, f646;
mul.f32 f648, f596, f639;
fma.rn.f32 f649, f597, f637, f648;
mul.f32 f650, f589, f649;
fma.rn.f32 f651, f647, f588, f650;
mul.f32 f652, f588, f649;
mul.f32 f653, f647, f589;
sub.f32 f654, f653, f652;
mul.f32 f655, f596, f647;
mul.f32 f656, f597, f649;
sub.f32 f657, f655, f656;
mul.f32 f658, f596, f649;
fma.rn.f32 f659, f597, f647, f658;
mul.f32 f660, f593, f659;
fma.rn.f32 f661, f657, f592, f660;
mul.f32 f662, f592, f659;
mul.f32 f663, f657, f593;
sub.f32 f664, f663, f662;
mul.f32 f665, f596, f657;
mul.f32 f666, f597, f659;
sub.f32 f667, f665, f666;
mul.f32 f668, f596, f659;
fma.rn.f32 f669, f597, f657, f668;
mul.f32 f670, f567, f669;
fma.rn.f32 f671, f667, f566, f670;
mul.f32 f672, f566, f669;
mul.f32 f673, f667, f567;
sub.f32 f674, f673, f672;
mul.f32 f675, f596, f667;
mul.f32 f676, f597, f669;
sub.f32 f677, f675, f676;
mul.f32 f678, f596, f669;
fma.rn.f32 f679, f597, f667, f678;
mul.f32 f680, f571, f679;
fma.rn.f32 f681, f677, f570, f680;
mul.f32 f682, f570, f679;
mul.f32 f683, f677, f571;
sub.f32 f684, f683, f682;
mul.f32 f685, f596, f677;
mul.f32 f686, f597, f679;
sub.f32 f687, f685, f686;
mul.f32 f688, f596, f679;
fma.rn.f32 f689, f597, f677, f688;
mul.f32 f690, f575, f689;
fma.rn.f32 f691, f687, f574, f690;
mul.f32 f692, f574, f689;
mul.f32 f693, f687, f575;
sub.f32 f694, f693, f692;
mul.f32 f695, f596, f687;
mul.f32 f696, f597, f689;
sub.f32 f697, f695, f696;
mul.f32 f698, f596, f689;
fma.rn.f32 f699, f597, f687, f698;
mul.f32 f700, f579, f699;
fma.rn.f32 f701, f697, f578, f700;
mul.f32 f702, f578, f699;
mul.f32 f703, f697, f579;
sub.f32 f704, f703, f702;
mul.f32 f705, f596, f697;
mul.f32 f706, f597, f699;
sub.f32 f707, f705, f706;
mul.f32 f708, f596, f699;
fma.rn.f32 f709, f597, f697, f708;
mul.f32 f710, f583, f709;
fma.rn.f32 f711, f707, f582, f710;
mul.f32 f712, f582, f709;
mul.f32 f713, f707, f583;
sub.f32 f714, f713, f712;
mul.f32 f715, f596, f707;
mul.f32 f716, f597, f709;
sub.f32 f717, f715, f716;
mul.f32 f718, f596, f709;
fma.rn.f32 f719, f597, f707, f718;
mul.f32 f720, f587, f719;
fma.rn.f32 f721, f717, f586, f720;
mul.f32 f722, f586, f719;
mul.f32 f723, f717, f587;
sub.f32 f724, f723, f722;
mul.f32 f725, f596, f717;
mul.f32 f726, f597, f719;
sub.f32 f727, f725, f726;
mul.f32 f728, f596, f719;
fma.rn.f32 f729, f597, f717, f728;
mul.f32 f730, f591, f729;
fma.rn.f32 f731, f727, f590, f730;
mul.f32 f732, f590, f729;
mul.f32 f733, f727, f591;
sub.f32 f734, f733, f732;
mul.f32 f735, f596, f727;
mul.f32 f736, f597, f729;
sub.f32 f737, f735, f736;
mul.f32 f738, f596, f729;
fma.rn.f32 f739, f597, f727, f738;
mul.f32 f740, f595, f739;
fma.rn.f32 f741, f737, f594, f740;
mul.f32 f742, f594, f739;
mul.f32 f743, f737, f595;
sub.f32 f744, f743, f742;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 60;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 3072;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f564;
st.shared.f32 [r20+64], f601;
st.shared.f32 [r20+128], f611;
st.shared.f32 [r20+192], f621;
st.shared.f32 [r20+256], f631;
st.shared.f32 [r20+320], f641;
st.shared.f32 [r20+384], f651;
st.shared.f32 [r20+448], f661;
st.shared.f32 [r20+512], f671;
st.shared.f32 [r20+576], f681;
st.shared.f32 [r20+640], f691;
st.shared.f32 [r20+704], f701;
st.shared.f32 [r20+768], f711;
st.shared.f32 [r20+832], f721;
st.shared.f32 [r20+896], f731;
st.shared.f32 [r20+960], f741;
barrier.sync 0;
mad.lo.s32 r21, r14, -60, r20;
ld.shared.f32 f745, [r21];
ld.shared.f32 f746, [r21+256];
ld.shared.f32 f747, [r21+512];
ld.shared.f32 f748, [r21+768];
ld.shared.f32 f749, [r21+1024];
ld.shared.f32 f750, [r21+1280];
ld.shared.f32 f751, [r21+1536];
ld.shared.f32 f752, [r21+1792];
ld.shared.f32 f753, [r21+2048];
ld.shared.f32 f754, [r21+2304];
ld.shared.f32 f755, [r21+2560];
ld.shared.f32 f756, [r21+2816];
ld.shared.f32 f757, [r21+3072];
ld.shared.f32 f758, [r21+3328];
ld.shared.f32 f759, [r21+3584];
ld.shared.f32 f760, [r21+3840];
barrier.sync 0;
st.shared.f32 [r20], f565;
st.shared.f32 [r20+64], f604;
st.shared.f32 [r20+128], f614;
st.shared.f32 [r20+192], f624;
st.shared.f32 [r20+256], f634;
st.shared.f32 [r20+320], f644;
st.shared.f32 [r20+384], f654;
st.shared.f32 [r20+448], f664;
st.shared.f32 [r20+512], f674;
st.shared.f32 [r20+576], f684;
st.shared.f32 [r20+640], f694;
st.shared.f32 [r20+704], f704;
st.shared.f32 [r20+768], f714;
st.shared.f32 [r20+832], f724;
st.shared.f32 [r20+896], f734;
st.shared.f32 [r20+960], f744;
barrier.sync 0;
ld.shared.f32 f761, [r21];
ld.shared.f32 f762, [r21+256];
ld.shared.f32 f763, [r21+512];
ld.shared.f32 f764, [r21+768];
ld.shared.f32 f765, [r21+1024];
ld.shared.f32 f766, [r21+1280];
ld.shared.f32 f767, [r21+1536];
ld.shared.f32 f768, [r21+1792];
ld.shared.f32 f769, [r21+2048];
ld.shared.f32 f770, [r21+2304];
ld.shared.f32 f771, [r21+2560];
ld.shared.f32 f772, [r21+2816];
ld.shared.f32 f773, [r21+3072];
ld.shared.f32 f774, [r21+3328];
ld.shared.f32 f775, [r21+3584];
ld.shared.f32 f776, [r21+3840];
add.f32 f777, f745, f753;
add.f32 f778, f761, f769;
sub.f32 f779, f745, f753;
sub.f32 f780, f761, f769;
add.f32 f781, f749, f757;
add.f32 f782, f765, f773;
sub.f32 f783, f749, f757;
sub.f32 f784, f765, f773;
add.f32 f785, f746, f754;
add.f32 f786, f762, f770;
sub.f32 f787, f746, f754;
sub.f32 f788, f762, f770;
add.f32 f789, f750, f758;
add.f32 f790, f766, f774;
sub.f32 f791, f750, f758;
sub.f32 f792, f766, f774;
add.f32 f793, f747, f755;
add.f32 f794, f763, f771;
sub.f32 f795, f747, f755;
sub.f32 f796, f763, f771;
add.f32 f797, f751, f759;
add.f32 f798, f767, f775;
sub.f32 f799, f751, f759;
sub.f32 f800, f767, f775;
add.f32 f801, f748, f756;
add.f32 f802, f764, f772;
sub.f32 f803, f748, f756;
sub.f32 f804, f764, f772;
add.f32 f805, f752, f760;
add.f32 f806, f768, f776;
sub.f32 f807, f752, f760;
sub.f32 f808, f768, f776;
add.f32 %0, f777, f781;
add.f32 %1, f778, f782;
add.f32 %2, f785, f789;
add.f32 %3, f786, f790;
add.f32 %4, f793, f797;
add.f32 %5, f794, f798;
add.f32 %6, f801, f805;
add.f32 %7, f802, f806;
add.f32 %9, f780, f783;
sub.f32 %8, f779, f784;
add.f32 %11, f788, f791;
sub.f32 %10, f787, f792;
add.f32 %13, f796, f799;
sub.f32 %12, f795, f800;
add.f32 %15, f804, f807;
sub.f32 %14, f803, f808;
sub.f32 %16, f777, f781;
sub.f32 %17, f778, f782;
sub.f32 %18, f785, f789;
sub.f32 %19, f786, f790;
sub.f32 %20, f793, f797;
sub.f32 %21, f794, f798;
sub.f32 %22, f801, f805;
sub.f32 %23, f802, f806;
sub.f32 %25, f780, f783;
add.f32 %24, f779, f784;
sub.f32 %27, f788, f791;
add.f32 %26, f787, f792;
sub.f32 %29, f796, f799;
add.f32 %28, f795, f800;
sub.f32 %31, f804, f807;
add.f32 %30, f803, f808;
})"
     // Operand map: %0..%31 are the outputs rmem[0..15].{x,y}; %32 is smem;
     // %33 is lut_sp_16_1024; %34 is lut_sp_16_64; %35 onward are the inputs.
     // Several rmem[k].y inputs are listed twice, so input operand numbers do
     // not advance two-per-element -- count positions in the list below when
     // matching a %N in the PTX to its rmem element.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y));
};




// Specialization <286, float, 1>: a single inline-PTX block that transforms the
// eight complex<float> values rmem[0..7] held by the calling thread.
//
// Parameters:
//   rmem - per-thread values. They are read through the "f" input operands and
//          overwritten through the "=f" output operands %0..%15
//          (%0/%1 = rmem[0].x/.y, ..., %14/%15 = rmem[7].x/.y).
//   smem - 32-bit address (operand %16) used as the base of every st.shared /
//          ld.shared access in the block.
//
// Shared-memory base used by a thread (registers r4, r10):
//   smem + (tid.y << 12) + ((tid.x << 5) & -4096)
// All loads and stores below stay within the 4096 bytes starting at that base.
// NOTE(review): the last term is zero only while tid.x < 128; presumably the
// launch configuration guarantees that - confirm against the caller.
//
// Sequence of operations visible in the PTX:
//   1. 8-point butterfly on the inputs: add/sub network whose only constants
//      are 0f3F3504F3 / 0fBF3504F3 (+/- sqrt(2)/2).
//   2. Load one complex value w = (f90, f91) from lut_sp_8_1024 (operand %17)
//      at byte offset (tid.x & 127) * 8, form w^2..w^7 by repeated complex
//      multiplication, and multiply seven of the eight butterfly outputs by
//      conj(w^k), k = 1..7 (re = wr*a + wi*b, im = wr*b - wi*a). The remaining
//      output (f74/f75) is passed on unchanged.
//   3. Exchange through shared memory. Real parts and imaginary parts are
//      exchanged in two separate passes; each pass stores 8 floats, then loads
//      8 floats at a 512-byte stride starting at base + 4 * (tid.x & 127).
//      Every store phase and load phase is separated by barrier.sync 0.
//   4. Steps 1-3 again, with the factor taken from lut_sp_8_128 (operand %18)
//      at byte offset (tid.x & 120) and stores at a 32-byte stride.
//   5. Steps 1-3 again, with the factor taken from lut_sp_8_16 (operand %19)
//      at entry index bit 6 of tid.x and stores at a 256-byte stride.
//   6. Final 2-point add/sub of the loaded values: rmem[0..3] receive the sums
//      and rmem[4..7] the differences of the pairs loaded 2048 bytes apart.
//
// The block executes barrier.sync 0 twelve times, so every thread that takes
// part in barrier 0 has to execute this function.
//
// NOTE(review): the asm statement declares no "memory" clobber although it
// stores to and loads from shared memory; confirm that callers do not depend
// on the compiler ordering their own shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<286, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<475>;
.reg .b32 r<28>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
sub.f32 f45, f35, f40;
add.f32 f46, f36, f39;
add.f32 f47, f35, f40;
sub.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
sub.f32 f61, f51, f56;
add.f32 f62, f52, f55;
add.f32 f63, f51, f56;
sub.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0f3F3504F3;
sub.f32 f67, f65, f66;
add.f32 f68, f65, f66;
mul.f32 f69, f63, 0fBF3504F3;
mul.f32 f70, f64, 0f3F3504F3;
sub.f32 f71, f69, f70;
mul.f32 f72, f64, 0fBF3504F3;
fma.rn.f32 f73, f63, 0f3F3504F3, f72;
add.f32 f74, f41, f57;
add.f32 f75, f42, f58;
sub.f32 f76, f41, f57;
sub.f32 f77, f42, f58;
add.f32 f78, f45, f67;
add.f32 f79, f46, f68;
sub.f32 f80, f45, f67;
sub.f32 f81, f46, f68;
sub.f32 f82, f43, f60;
add.f32 f83, f44, f59;
add.f32 f84, f43, f60;
sub.f32 f85, f44, f59;
add.f32 f86, f47, f71;
add.f32 f87, f48, f73;
sub.f32 f88, f47, f71;
sub.f32 f89, f48, f73;
and.b32 r6, r5, 127;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f90, f91}, [rd5];
mul.f32 f94, f79, f91;
fma.rn.f32 f95, f90, f78, f94;
mul.f32 f96, f78, f91;
mul.f32 f97, f90, f79;
sub.f32 f98, f97, f96;
mul.f32 f99, f90, f90;
mul.f32 f100, f91, f91;
sub.f32 f101, f99, f100;
mul.f32 f102, f91, f90;
fma.rn.f32 f103, f91, f90, f102;
mul.f32 f104, f83, f103;
fma.rn.f32 f105, f101, f82, f104;
mul.f32 f106, f82, f103;
mul.f32 f107, f101, f83;
sub.f32 f108, f107, f106;
mul.f32 f109, f90, f101;
mul.f32 f110, f91, f103;
sub.f32 f111, f109, f110;
mul.f32 f112, f90, f103;
fma.rn.f32 f113, f91, f101, f112;
mul.f32 f114, f87, f113;
fma.rn.f32 f115, f111, f86, f114;
mul.f32 f116, f86, f113;
mul.f32 f117, f111, f87;
sub.f32 f118, f117, f116;
mul.f32 f119, f90, f111;
mul.f32 f120, f91, f113;
sub.f32 f121, f119, f120;
mul.f32 f122, f90, f113;
fma.rn.f32 f123, f91, f111, f122;
mul.f32 f124, f77, f123;
fma.rn.f32 f125, f121, f76, f124;
mul.f32 f126, f76, f123;
mul.f32 f127, f121, f77;
sub.f32 f128, f127, f126;
mul.f32 f129, f90, f121;
mul.f32 f130, f91, f123;
sub.f32 f131, f129, f130;
mul.f32 f132, f90, f123;
fma.rn.f32 f133, f91, f121, f132;
mul.f32 f134, f81, f133;
fma.rn.f32 f135, f131, f80, f134;
mul.f32 f136, f80, f133;
mul.f32 f137, f131, f81;
sub.f32 f138, f137, f136;
mul.f32 f139, f90, f131;
mul.f32 f140, f91, f133;
sub.f32 f141, f139, f140;
mul.f32 f142, f90, f133;
fma.rn.f32 f143, f91, f131, f142;
mul.f32 f144, f85, f143;
fma.rn.f32 f145, f141, f84, f144;
mul.f32 f146, f84, f143;
mul.f32 f147, f141, f85;
sub.f32 f148, f147, f146;
mul.f32 f149, f90, f141;
mul.f32 f150, f91, f143;
sub.f32 f151, f149, f150;
mul.f32 f152, f90, f143;
fma.rn.f32 f153, f91, f141, f152;
mul.f32 f154, f89, f153;
fma.rn.f32 f155, f151, f88, f154;
mul.f32 f156, f88, f153;
mul.f32 f157, f151, f89;
sub.f32 f158, f157, f156;
shl.b32 r8, r5, 5;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4064;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f74, f95, f105, f115};
st.shared.v4.f32 [r12+16], {f125, f135, f145, f155};
barrier.sync 0;
mad.lo.s32 r13, r6, -28, r12;
ld.shared.f32 f159, [r13];
ld.shared.f32 f160, [r13+512];
ld.shared.f32 f161, [r13+1024];
ld.shared.f32 f162, [r13+1536];
ld.shared.f32 f163, [r13+2048];
ld.shared.f32 f164, [r13+2560];
ld.shared.f32 f165, [r13+3072];
ld.shared.f32 f166, [r13+3584];
barrier.sync 0;
st.shared.v4.f32 [r12], {f75, f98, f108, f118};
st.shared.v4.f32 [r12+16], {f128, f138, f148, f158};
barrier.sync 0;
ld.shared.f32 f167, [r13];
ld.shared.f32 f168, [r13+512];
ld.shared.f32 f169, [r13+1024];
ld.shared.f32 f170, [r13+1536];
ld.shared.f32 f171, [r13+2048];
ld.shared.f32 f172, [r13+2560];
ld.shared.f32 f173, [r13+3072];
ld.shared.f32 f174, [r13+3584];
add.f32 f175, f159, f163;
add.f32 f176, f167, f171;
sub.f32 f177, f159, f163;
sub.f32 f178, f167, f171;
add.f32 f179, f161, f165;
add.f32 f180, f169, f173;
sub.f32 f181, f161, f165;
sub.f32 f182, f169, f173;
add.f32 f183, f175, f179;
add.f32 f184, f176, f180;
sub.f32 f185, f175, f179;
sub.f32 f186, f176, f180;
sub.f32 f187, f177, f182;
add.f32 f188, f178, f181;
add.f32 f189, f177, f182;
sub.f32 f190, f178, f181;
add.f32 f191, f160, f164;
add.f32 f192, f168, f172;
sub.f32 f193, f160, f164;
sub.f32 f194, f168, f172;
add.f32 f195, f162, f166;
add.f32 f196, f170, f174;
sub.f32 f197, f162, f166;
sub.f32 f198, f170, f174;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
sub.f32 f203, f193, f198;
add.f32 f204, f194, f197;
add.f32 f205, f193, f198;
sub.f32 f206, f194, f197;
mul.f32 f207, f203, 0f3F3504F3;
mul.f32 f208, f204, 0f3F3504F3;
sub.f32 f209, f207, f208;
add.f32 f210, f207, f208;
mul.f32 f211, f205, 0fBF3504F3;
mul.f32 f212, f206, 0f3F3504F3;
sub.f32 f213, f211, f212;
mul.f32 f214, f206, 0fBF3504F3;
fma.rn.f32 f215, f205, 0f3F3504F3, f214;
add.f32 f216, f183, f199;
add.f32 f217, f184, f200;
sub.f32 f218, f183, f199;
sub.f32 f219, f184, f200;
add.f32 f220, f187, f209;
add.f32 f221, f188, f210;
sub.f32 f222, f187, f209;
sub.f32 f223, f188, f210;
sub.f32 f224, f185, f202;
add.f32 f225, f186, f201;
add.f32 f226, f185, f202;
sub.f32 f227, f186, f201;
add.f32 f228, f189, f213;
add.f32 f229, f190, f215;
sub.f32 f230, f189, f213;
sub.f32 f231, f190, f215;
and.b32 r14, r5, 120;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f232, f233}, [rd8];
mul.f32 f236, f221, f233;
fma.rn.f32 f237, f232, f220, f236;
mul.f32 f238, f220, f233;
mul.f32 f239, f232, f221;
sub.f32 f240, f239, f238;
mul.f32 f241, f232, f232;
mul.f32 f242, f233, f233;
sub.f32 f243, f241, f242;
mul.f32 f244, f233, f232;
fma.rn.f32 f245, f233, f232, f244;
mul.f32 f246, f225, f245;
fma.rn.f32 f247, f243, f224, f246;
mul.f32 f248, f224, f245;
mul.f32 f249, f243, f225;
sub.f32 f250, f249, f248;
mul.f32 f251, f232, f243;
mul.f32 f252, f233, f245;
sub.f32 f253, f251, f252;
mul.f32 f254, f232, f245;
fma.rn.f32 f255, f233, f243, f254;
mul.f32 f256, f229, f255;
fma.rn.f32 f257, f253, f228, f256;
mul.f32 f258, f228, f255;
mul.f32 f259, f253, f229;
sub.f32 f260, f259, f258;
mul.f32 f261, f232, f253;
mul.f32 f262, f233, f255;
sub.f32 f263, f261, f262;
mul.f32 f264, f232, f255;
fma.rn.f32 f265, f233, f253, f264;
mul.f32 f266, f219, f265;
fma.rn.f32 f267, f263, f218, f266;
mul.f32 f268, f218, f265;
mul.f32 f269, f263, f219;
sub.f32 f270, f269, f268;
mul.f32 f271, f232, f263;
mul.f32 f272, f233, f265;
sub.f32 f273, f271, f272;
mul.f32 f274, f232, f265;
fma.rn.f32 f275, f233, f263, f274;
mul.f32 f276, f223, f275;
fma.rn.f32 f277, f273, f222, f276;
mul.f32 f278, f222, f275;
mul.f32 f279, f273, f223;
sub.f32 f280, f279, f278;
mul.f32 f281, f232, f273;
mul.f32 f282, f233, f275;
sub.f32 f283, f281, f282;
mul.f32 f284, f232, f275;
fma.rn.f32 f285, f233, f273, f284;
mul.f32 f286, f227, f285;
fma.rn.f32 f287, f283, f226, f286;
mul.f32 f288, f226, f285;
mul.f32 f289, f283, f227;
sub.f32 f290, f289, f288;
mul.f32 f291, f232, f283;
mul.f32 f292, f233, f285;
sub.f32 f293, f291, f292;
mul.f32 f294, f232, f285;
fma.rn.f32 f295, f233, f283, f294;
mul.f32 f296, f231, f295;
fma.rn.f32 f297, f293, f230, f296;
mul.f32 f298, f230, f295;
mul.f32 f299, f293, f231;
sub.f32 f300, f299, f298;
shl.b32 r15, r5, 2;
and.b32 r16, r15, 28;
add.s32 r17, r10, r16;
barrier.sync 0;
and.b32 r18, r8, 3840;
add.s32 r19, r17, r18;
st.shared.f32 [r19], f216;
st.shared.f32 [r19+32], f237;
st.shared.f32 [r19+64], f247;
st.shared.f32 [r19+96], f257;
st.shared.f32 [r19+128], f267;
st.shared.f32 [r19+160], f277;
st.shared.f32 [r19+192], f287;
st.shared.f32 [r19+224], f297;
barrier.sync 0;
mad.lo.s32 r20, r14, -28, r19;
ld.shared.f32 f301, [r20];
ld.shared.f32 f302, [r20+512];
ld.shared.f32 f303, [r20+1024];
ld.shared.f32 f304, [r20+1536];
ld.shared.f32 f305, [r20+2048];
ld.shared.f32 f306, [r20+2560];
ld.shared.f32 f307, [r20+3072];
ld.shared.f32 f308, [r20+3584];
barrier.sync 0;
st.shared.f32 [r19], f217;
st.shared.f32 [r19+32], f240;
st.shared.f32 [r19+64], f250;
st.shared.f32 [r19+96], f260;
st.shared.f32 [r19+128], f270;
st.shared.f32 [r19+160], f280;
st.shared.f32 [r19+192], f290;
st.shared.f32 [r19+224], f300;
barrier.sync 0;
ld.shared.f32 f309, [r20];
ld.shared.f32 f310, [r20+512];
ld.shared.f32 f311, [r20+1024];
ld.shared.f32 f312, [r20+1536];
ld.shared.f32 f313, [r20+2048];
ld.shared.f32 f314, [r20+2560];
ld.shared.f32 f315, [r20+3072];
ld.shared.f32 f316, [r20+3584];
add.f32 f317, f301, f305;
add.f32 f318, f309, f313;
sub.f32 f319, f301, f305;
sub.f32 f320, f309, f313;
add.f32 f321, f303, f307;
add.f32 f322, f311, f315;
sub.f32 f323, f303, f307;
sub.f32 f324, f311, f315;
add.f32 f325, f317, f321;
add.f32 f326, f318, f322;
sub.f32 f327, f317, f321;
sub.f32 f328, f318, f322;
sub.f32 f329, f319, f324;
add.f32 f330, f320, f323;
add.f32 f331, f319, f324;
sub.f32 f332, f320, f323;
add.f32 f333, f302, f306;
add.f32 f334, f310, f314;
sub.f32 f335, f302, f306;
sub.f32 f336, f310, f314;
add.f32 f337, f304, f308;
add.f32 f338, f312, f316;
sub.f32 f339, f304, f308;
sub.f32 f340, f312, f316;
add.f32 f341, f333, f337;
add.f32 f342, f334, f338;
sub.f32 f343, f333, f337;
sub.f32 f344, f334, f338;
sub.f32 f345, f335, f340;
add.f32 f346, f336, f339;
add.f32 f347, f335, f340;
sub.f32 f348, f336, f339;
mul.f32 f349, f345, 0f3F3504F3;
mul.f32 f350, f346, 0f3F3504F3;
sub.f32 f351, f349, f350;
add.f32 f352, f349, f350;
mul.f32 f353, f347, 0fBF3504F3;
mul.f32 f354, f348, 0f3F3504F3;
sub.f32 f355, f353, f354;
mul.f32 f356, f348, 0fBF3504F3;
fma.rn.f32 f357, f347, 0f3F3504F3, f356;
add.f32 f358, f325, f341;
add.f32 f359, f326, f342;
sub.f32 f360, f325, f341;
sub.f32 f361, f326, f342;
add.f32 f362, f329, f351;
add.f32 f363, f330, f352;
sub.f32 f364, f329, f351;
sub.f32 f365, f330, f352;
sub.f32 f366, f327, f344;
add.f32 f367, f328, f343;
add.f32 f368, f327, f344;
sub.f32 f369, f328, f343;
add.f32 f370, f331, f355;
add.f32 f371, f332, f357;
sub.f32 f372, f331, f355;
sub.f32 f373, f332, f357;
and.b32 r21, r5, 64;
bfe.u32 r22, r5, 6, 1;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f374, f375}, [rd11];
mul.f32 f378, f363, f375;
fma.rn.f32 f379, f374, f362, f378;
mul.f32 f380, f362, f375;
mul.f32 f381, f374, f363;
sub.f32 f382, f381, f380;
mul.f32 f383, f374, f374;
mul.f32 f384, f375, f375;
sub.f32 f385, f383, f384;
mul.f32 f386, f375, f374;
fma.rn.f32 f387, f375, f374, f386;
mul.f32 f388, f367, f387;
fma.rn.f32 f389, f385, f366, f388;
mul.f32 f390, f366, f387;
mul.f32 f391, f385, f367;
sub.f32 f392, f391, f390;
mul.f32 f393, f374, f385;
mul.f32 f394, f375, f387;
sub.f32 f395, f393, f394;
mul.f32 f396, f374, f387;
fma.rn.f32 f397, f375, f385, f396;
mul.f32 f398, f371, f397;
fma.rn.f32 f399, f395, f370, f398;
mul.f32 f400, f370, f397;
mul.f32 f401, f395, f371;
sub.f32 f402, f401, f400;
mul.f32 f403, f374, f395;
mul.f32 f404, f375, f397;
sub.f32 f405, f403, f404;
mul.f32 f406, f374, f397;
fma.rn.f32 f407, f375, f395, f406;
mul.f32 f408, f361, f407;
fma.rn.f32 f409, f405, f360, f408;
mul.f32 f410, f360, f407;
mul.f32 f411, f405, f361;
sub.f32 f412, f411, f410;
mul.f32 f413, f374, f405;
mul.f32 f414, f375, f407;
sub.f32 f415, f413, f414;
mul.f32 f416, f374, f407;
fma.rn.f32 f417, f375, f405, f416;
mul.f32 f418, f365, f417;
fma.rn.f32 f419, f415, f364, f418;
mul.f32 f420, f364, f417;
mul.f32 f421, f415, f365;
sub.f32 f422, f421, f420;
mul.f32 f423, f374, f415;
mul.f32 f424, f375, f417;
sub.f32 f425, f423, f424;
mul.f32 f426, f374, f417;
fma.rn.f32 f427, f375, f415, f426;
mul.f32 f428, f369, f427;
fma.rn.f32 f429, f425, f368, f428;
mul.f32 f430, f368, f427;
mul.f32 f431, f425, f369;
sub.f32 f432, f431, f430;
mul.f32 f433, f374, f425;
mul.f32 f434, f375, f427;
sub.f32 f435, f433, f434;
mul.f32 f436, f374, f427;
fma.rn.f32 f437, f375, f425, f436;
mul.f32 f438, f373, f437;
fma.rn.f32 f439, f435, f372, f438;
mul.f32 f440, f372, f437;
mul.f32 f441, f435, f373;
sub.f32 f442, f441, f440;
and.b32 r23, r15, 252;
add.s32 r24, r10, r23;
barrier.sync 0;
and.b32 r25, r8, 2048;
add.s32 r26, r24, r25;
st.shared.f32 [r26], f358;
st.shared.f32 [r26+256], f379;
st.shared.f32 [r26+512], f389;
st.shared.f32 [r26+768], f399;
st.shared.f32 [r26+1024], f409;
st.shared.f32 [r26+1280], f419;
st.shared.f32 [r26+1536], f429;
st.shared.f32 [r26+1792], f439;
barrier.sync 0;
mad.lo.s32 r27, r21, -28, r26;
ld.shared.f32 f443, [r27];
ld.shared.f32 f444, [r27+512];
ld.shared.f32 f445, [r27+1024];
ld.shared.f32 f446, [r27+1536];
ld.shared.f32 f447, [r27+2048];
ld.shared.f32 f448, [r27+2560];
ld.shared.f32 f449, [r27+3072];
ld.shared.f32 f450, [r27+3584];
barrier.sync 0;
st.shared.f32 [r26], f359;
st.shared.f32 [r26+256], f382;
st.shared.f32 [r26+512], f392;
st.shared.f32 [r26+768], f402;
st.shared.f32 [r26+1024], f412;
st.shared.f32 [r26+1280], f422;
st.shared.f32 [r26+1536], f432;
st.shared.f32 [r26+1792], f442;
barrier.sync 0;
ld.shared.f32 f451, [r27];
ld.shared.f32 f452, [r27+512];
ld.shared.f32 f453, [r27+1024];
ld.shared.f32 f454, [r27+1536];
ld.shared.f32 f455, [r27+2048];
ld.shared.f32 f456, [r27+2560];
ld.shared.f32 f457, [r27+3072];
ld.shared.f32 f458, [r27+3584];
add.f32 %0, f443, f447;
add.f32 %1, f451, f455;
add.f32 %2, f444, f448;
add.f32 %3, f452, f456;
add.f32 %4, f445, f449;
add.f32 %5, f453, f457;
add.f32 %6, f446, f450;
add.f32 %7, f454, f458;
sub.f32 %8, f443, f447;
sub.f32 %9, f451, f455;
sub.f32 %10, f444, f448;
sub.f32 %11, f452, f456;
sub.f32 %12, f445, f449;
sub.f32 %13, f453, f457;
sub.f32 %14, f446, f450;
sub.f32 %15, f454, f458;
})"
     // Operand numbering: %0..%15 are the outputs, %16 = smem, %17..%19 = the
     // three lookup tables, %20..%39 = the rmem inputs. The input list repeats
     // rmem[1].y, rmem[2].y, rmem[4].y and rmem[5].y, so the PTX operand index
     // is not simply 20 + 2*i (+1); %23, %26, %31 and %34 are never referenced
     // by the PTX above. Every input operand is read before the first output
     // operand is written, so no output overwrites an input that is still
     // needed.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<287, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<1778>;
.reg .b32 r<18>;
.reg .b64 rd<9>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %64;
add.s32 r4, r3, r2;
add.f32 f129, %66, %98;
sub.f32 f131, %66, %98;
add.f32 f1776, %67, %130;
sub.f32 f132, %67, %130;
add.f32 f133, %82, %114;
sub.f32 f135, %82, %114;
add.f32 f1774, %131, %115;
sub.f32 f136, %131, %115;
add.f32 f137, f129, f133;
sub.f32 f139, f129, f133;
add.f32 f1773, f1776, f1774;
sub.f32 f140, f1776, f1774;
sub.f32 f141, f131, f136;
add.f32 f143, f131, f136;
add.f32 f1772, f132, f135;
sub.f32 f144, f132, f135;
add.f32 f145, %74, %106;
sub.f32 f147, %74, %106;
add.f32 f1769, %132, %133;
sub.f32 f148, %132, %133;
add.f32 f149, %90, %122;
sub.f32 f151, %90, %122;
add.f32 f1767, %91, %134;
sub.f32 f152, %91, %134;
add.f32 f153, f145, f149;
sub.f32 f155, f145, f149;
add.f32 f1766, f1769, f1767;
sub.f32 f156, f1769, f1767;
sub.f32 f157, f147, f152;
add.f32 f159, f147, f152;
add.f32 f1765, f148, f151;
sub.f32 f160, f148, f151;
mul.f32 f161, f157, 0f3F3504F3;
mul.f32 f162, f1765, 0f3F3504F3;
sub.f32 f163, f161, f162;
add.f32 f164, f161, f162;
mul.f32 f1763, f159, 0fBF3504F3;
mul.f32 f1764, f160, 0f3F3504F3;
sub.f32 f167, f1763, f1764;
mul.f32 f168, f160, 0fBF3504F3;
fma.rn.f32 f169, f159, 0f3F3504F3, f168;
add.f32 f170, f137, f153;
sub.f32 f172, f137, f153;
add.f32 f1762, f1773, f1766;
sub.f32 f173, f1773, f1766;
add.f32 f174, f141, f163;
sub.f32 f176, f141, f163;
add.f32 f1761, f1772, f164;
sub.f32 f177, f1772, f164;
sub.f32 f178, f139, f156;
add.f32 f180, f139, f156;
add.f32 f1760, f140, f155;
sub.f32 f181, f140, f155;
add.f32 f182, f143, f167;
sub.f32 f184, f143, f167;
add.f32 f1759, f144, f169;
sub.f32 f185, f144, f169;
add.f32 f186, %70, %102;
sub.f32 f188, %70, %102;
add.f32 f1757, %135, %103;
sub.f32 f189, %135, %103;
add.f32 f190, %86, %118;
sub.f32 f192, %86, %118;
add.f32 f1754, %137, %136;
sub.f32 f193, %137, %136;
add.f32 f194, f186, f190;
sub.f32 f196, f186, f190;
add.f32 f1753, f1757, f1754;
sub.f32 f197, f1757, f1754;
sub.f32 f198, f188, f193;
add.f32 f200, f188, f193;
add.f32 f1752, f189, f192;
sub.f32 f201, f189, f192;
add.f32 f202, %78, %110;
sub.f32 f204, %78, %110;
add.f32 f1750, %79, %138;
sub.f32 f205, %79, %138;
add.f32 f206, %94, %126;
sub.f32 f208, %94, %126;
add.f32 f1748, %139, %127;
sub.f32 f209, %139, %127;
add.f32 f210, f202, f206;
sub.f32 f212, f202, f206;
add.f32 f1747, f1750, f1748;
sub.f32 f213, f1750, f1748;
sub.f32 f214, f204, f209;
add.f32 f216, f204, f209;
add.f32 f1746, f205, f208;
sub.f32 f217, f205, f208;
mul.f32 f218, f214, 0f3F3504F3;
mul.f32 f219, f1746, 0f3F3504F3;
sub.f32 f220, f218, f219;
add.f32 f221, f218, f219;
mul.f32 f1744, f216, 0fBF3504F3;
mul.f32 f1745, f217, 0f3F3504F3;
sub.f32 f224, f1744, f1745;
mul.f32 f225, f217, 0fBF3504F3;
fma.rn.f32 f226, f216, 0f3F3504F3, f225;
add.f32 f227, f194, f210;
sub.f32 f229, f194, f210;
add.f32 f1743, f1753, f1747;
sub.f32 f230, f1753, f1747;
add.f32 f231, f198, f220;
sub.f32 f233, f198, f220;
add.f32 f1742, f1752, f221;
sub.f32 f234, f1752, f221;
sub.f32 f235, f196, f213;
add.f32 f237, f196, f213;
add.f32 f1741, f197, f212;
sub.f32 f238, f197, f212;
add.f32 f239, f200, f224;
sub.f32 f241, f200, f224;
add.f32 f1740, f201, f226;
sub.f32 f242, f201, f226;
mul.f32 f1738, f231, 0f3F6C835E;
mul.f32 f1739, f1742, 0f3EC3EF15;
sub.f32 f245, f1738, f1739;
mul.f32 f246, f1742, 0f3F6C835E;
fma.rn.f32 f247, f231, 0f3EC3EF15, f246;
mul.f32 f248, f235, 0f3F3504F3;
mul.f32 f249, f1741, 0f3F3504F3;
sub.f32 f250, f248, f249;
add.f32 f251, f248, f249;
mul.f32 f253, f1740, 0f3F6C835E;
mul.f32 f1737, f239, 0f3EC3EF15;
sub.f32 f254, f1737, f253;
mul.f32 f255, f1740, 0f3EC3EF15;
fma.rn.f32 f256, f239, 0f3F6C835E, f255;
mul.f32 f258, f234, 0f3F6C835E;
mul.f32 f1736, f233, 0fBEC3EF15;
sub.f32 f259, f1736, f258;
mul.f32 f260, f234, 0fBEC3EF15;
fma.rn.f32 f261, f233, 0f3F6C835E, f260;
mul.f32 f1734, f237, 0fBF3504F3;
mul.f32 f1735, f238, 0f3F3504F3;
sub.f32 f264, f1734, f1735;
mul.f32 f265, f238, 0fBF3504F3;
fma.rn.f32 f266, f237, 0f3F3504F3, f265;
mul.f32 f1732, f241, 0fBF6C835E;
mul.f32 f1733, f242, 0f3EC3EF15;
sub.f32 f269, f1732, f1733;
mul.f32 f270, f242, 0fBF6C835E;
fma.rn.f32 f271, f241, 0f3EC3EF15, f270;
add.f32 f272, f170, f227;
sub.f32 f274, f170, f227;
add.f32 f1731, f1762, f1743;
sub.f32 f275, f1762, f1743;
add.f32 f276, f174, f245;
sub.f32 f278, f174, f245;
add.f32 f1730, f1761, f247;
sub.f32 f279, f1761, f247;
add.f32 f280, f178, f250;
sub.f32 f282, f178, f250;
add.f32 f1729, f1760, f251;
sub.f32 f283, f1760, f251;
add.f32 f284, f182, f254;
sub.f32 f286, f182, f254;
add.f32 f1728, f1759, f256;
sub.f32 f287, f1759, f256;
sub.f32 f288, f172, f230;
add.f32 f290, f172, f230;
add.f32 f1727, f173, f229;
sub.f32 f291, f173, f229;
add.f32 f292, f176, f259;
sub.f32 f294, f176, f259;
add.f32 f1726, f177, f261;
sub.f32 f295, f177, f261;
add.f32 f296, f180, f264;
sub.f32 f298, f180, f264;
add.f32 f1725, f181, f266;
sub.f32 f299, f181, f266;
add.f32 f300, f184, f269;
sub.f32 f302, f184, f269;
add.f32 f1724, f185, f271;
sub.f32 f303, f185, f271;
add.f32 f304, %68, %100;
sub.f32 f306, %68, %100;
add.f32 f1721, %141, %140;
sub.f32 f307, %141, %140;
add.f32 f308, %84, %116;
sub.f32 f310, %84, %116;
add.f32 f1719, %85, %142;
sub.f32 f311, %85, %142;
add.f32 f312, f304, f308;
sub.f32 f314, f304, f308;
add.f32 f1718, f1721, f1719;
sub.f32 f315, f1721, f1719;
sub.f32 f316, f306, f311;
add.f32 f318, f306, f311;
add.f32 f1717, f307, f310;
sub.f32 f319, f307, f310;
add.f32 f320, %76, %108;
sub.f32 f322, %76, %108;
add.f32 f1715, %143, %109;
sub.f32 f323, %143, %109;
add.f32 f324, %92, %124;
sub.f32 f326, %92, %124;
add.f32 f1712, %145, %144;
sub.f32 f327, %145, %144;
add.f32 f328, f320, f324;
sub.f32 f330, f320, f324;
add.f32 f1711, f1715, f1712;
sub.f32 f331, f1715, f1712;
sub.f32 f332, f322, f327;
add.f32 f334, f322, f327;
add.f32 f1710, f323, f326;
sub.f32 f335, f323, f326;
mul.f32 f336, f332, 0f3F3504F3;
mul.f32 f337, f1710, 0f3F3504F3;
sub.f32 f338, f336, f337;
add.f32 f339, f336, f337;
mul.f32 f341, f335, 0f3F3504F3;
mul.f32 f1709, f334, 0fBF3504F3;
sub.f32 f342, f1709, f341;
mul.f32 f343, f335, 0fBF3504F3;
fma.rn.f32 f344, f334, 0f3F3504F3, f343;
add.f32 f345, f312, f328;
sub.f32 f347, f312, f328;
add.f32 f1708, f1718, f1711;
sub.f32 f348, f1718, f1711;
add.f32 f349, f316, f338;
sub.f32 f351, f316, f338;
add.f32 f1707, f1717, f339;
sub.f32 f352, f1717, f339;
sub.f32 f353, f314, f331;
add.f32 f355, f314, f331;
add.f32 f1706, f315, f330;
sub.f32 f356, f315, f330;
add.f32 f357, f318, f342;
sub.f32 f359, f318, f342;
add.f32 f1705, f319, f344;
sub.f32 f360, f319, f344;
add.f32 f361, %72, %104;
sub.f32 f363, %72, %104;
add.f32 f1703, %73, %146;
sub.f32 f364, %73, %146;
add.f32 f365, %88, %120;
sub.f32 f367, %88, %120;
add.f32 f1701, %147, %121;
sub.f32 f368, %147, %121;
add.f32 f369, f361, f365;
sub.f32 f371, f361, f365;
add.f32 f1700, f1703, f1701;
sub.f32 f372, f1703, f1701;
sub.f32 f373, f363, f368;
add.f32 f375, f363, f368;
add.f32 f1699, f364, f367;
sub.f32 f376, f364, f367;
add.f32 f377, %80, %112;
sub.f32 f379, %80, %112;
add.f32 f1696, %148, %149;
sub.f32 f380, %148, %149;
add.f32 f381, %96, %128;
sub.f32 f383, %96, %128;
add.f32 f1695, %97, %129;
sub.f32 f384, %97, %129;
add.f32 f385, f377, f381;
sub.f32 f387, f377, f381;
add.f32 f1694, f1696, f1695;
sub.f32 f388, f1696, f1695;
sub.f32 f389, f379, f384;
add.f32 f391, f379, f384;
add.f32 f1693, f380, f383;
sub.f32 f392, f380, f383;
mul.f32 f393, f389, 0f3F3504F3;
mul.f32 f394, f1693, 0f3F3504F3;
sub.f32 f395, f393, f394;
add.f32 f396, f393, f394;
mul.f32 f1691, f391, 0fBF3504F3;
mul.f32 f1692, f392, 0f3F3504F3;
sub.f32 f399, f1691, f1692;
mul.f32 f400, f392, 0fBF3504F3;
fma.rn.f32 f401, f391, 0f3F3504F3, f400;
add.f32 f402, f369, f385;
sub.f32 f404, f369, f385;
add.f32 f1690, f1700, f1694;
sub.f32 f405, f1700, f1694;
add.f32 f406, f373, f395;
sub.f32 f408, f373, f395;
add.f32 f1689, f1699, f396;
sub.f32 f409, f1699, f396;
sub.f32 f410, f371, f388;
add.f32 f412, f371, f388;
add.f32 f1688, f372, f387;
sub.f32 f413, f372, f387;
add.f32 f414, f375, f399;
sub.f32 f416, f375, f399;
add.f32 f1687, f376, f401;
sub.f32 f417, f376, f401;
mul.f32 f419, f1689, 0f3EC3EF15;
mul.f32 f1686, f406, 0f3F6C835E;
sub.f32 f420, f1686, f419;
mul.f32 f421, f1689, 0f3F6C835E;
fma.rn.f32 f422, f406, 0f3EC3EF15, f421;
mul.f32 f423, f410, 0f3F3504F3;
mul.f32 f424, f1688, 0f3F3504F3;
sub.f32 f425, f423, f424;
add.f32 f426, f423, f424;
mul.f32 f428, f1687, 0f3F6C835E;
mul.f32 f1685, f414, 0f3EC3EF15;
sub.f32 f429, f1685, f428;
mul.f32 f430, f1687, 0f3EC3EF15;
fma.rn.f32 f431, f414, 0f3F6C835E, f430;
mul.f32 f433, f409, 0f3F6C835E;
mul.f32 f1684, f408, 0fBEC3EF15;
sub.f32 f434, f1684, f433;
mul.f32 f435, f409, 0fBEC3EF15;
fma.rn.f32 f436, f408, 0f3F6C835E, f435;
mul.f32 f438, f413, 0f3F3504F3;
mul.f32 f1683, f412, 0fBF3504F3;
sub.f32 f439, f1683, f438;
mul.f32 f440, f413, 0fBF3504F3;
fma.rn.f32 f441, f412, 0f3F3504F3, f440;
mul.f32 f443, f417, 0f3EC3EF15;
mul.f32 f1682, f416, 0fBF6C835E;
sub.f32 f444, f1682, f443;
mul.f32 f445, f417, 0fBF6C835E;
fma.rn.f32 f446, f416, 0f3EC3EF15, f445;
add.f32 f447, f345, f402;
sub.f32 f449, f345, f402;
add.f32 f1681, f1708, f1690;
sub.f32 f450, f1708, f1690;
add.f32 f451, f349, f420;
sub.f32 f453, f349, f420;
add.f32 f1680, f1707, f422;
sub.f32 f454, f1707, f422;
add.f32 f455, f353, f425;
sub.f32 f457, f353, f425;
add.f32 f1679, f1706, f426;
sub.f32 f458, f1706, f426;
add.f32 f459, f357, f429;
sub.f32 f461, f357, f429;
add.f32 f1678, f1705, f431;
sub.f32 f462, f1705, f431;
sub.f32 f463, f347, f405;
add.f32 f465, f347, f405;
add.f32 f1677, f348, f404;
sub.f32 f466, f348, f404;
add.f32 f467, f351, f434;
sub.f32 f469, f351, f434;
add.f32 f1676, f352, f436;
sub.f32 f470, f352, f436;
add.f32 f471, f355, f439;
sub.f32 f473, f355, f439;
add.f32 f1675, f356, f441;
sub.f32 f474, f356, f441;
add.f32 f475, f359, f444;
sub.f32 f477, f359, f444;
add.f32 f1674, f360, f446;
sub.f32 f478, f360, f446;
mul.f32 f480, f1680, 0f3E47C5C2;
mul.f32 f1673, f451, 0f3F7B14BE;
sub.f32 f481, f1673, f480;
mul.f32 f482, f1680, 0f3F7B14BE;
fma.rn.f32 f483, f451, 0f3E47C5C2, f482;
mul.f32 f485, f1679, 0f3EC3EF15;
mul.f32 f1672, f455, 0f3F6C835E;
sub.f32 f486, f1672, f485;
mul.f32 f487, f1679, 0f3F6C835E;
fma.rn.f32 f488, f455, 0f3EC3EF15, f487;
mul.f32 f490, f1678, 0f3F0E39DA;
mul.f32 f1671, f459, 0f3F54DB31;
sub.f32 f491, f1671, f490;
mul.f32 f492, f1678, 0f3F54DB31;
fma.rn.f32 f493, f459, 0f3F0E39DA, f492;
mul.f32 f494, f463, 0f3F3504F3;
mul.f32 f495, f1677, 0f3F3504F3;
sub.f32 f496, f494, f495;
add.f32 f497, f494, f495;
mul.f32 f499, f1676, 0f3F54DB31;
mul.f32 f1670, f467, 0f3F0E39DA;
sub.f32 f500, f1670, f499;
mul.f32 f501, f1676, 0f3F0E39DA;
fma.rn.f32 f502, f467, 0f3F54DB31, f501;
mul.f32 f504, f1675, 0f3F6C835E;
mul.f32 f1669, f471, 0f3EC3EF15;
sub.f32 f505, f1669, f504;
mul.f32 f506, f1675, 0f3EC3EF15;
fma.rn.f32 f507, f471, 0f3F6C835E, f506;
mul.f32 f509, f1674, 0f3F7B14BE;
mul.f32 f1668, f475, 0f3E47C5C2;
sub.f32 f510, f1668, f509;
mul.f32 f511, f1674, 0f3E47C5C2;
fma.rn.f32 f512, f475, 0f3F7B14BE, f511;
mul.f32 f514, f454, 0f3F7B14BE;
mul.f32 f1667, f453, 0fBE47C5C2;
sub.f32 f515, f1667, f514;
mul.f32 f516, f454, 0fBE47C5C2;
fma.rn.f32 f517, f453, 0f3F7B14BE, f516;
mul.f32 f1665, f457, 0fBEC3EF15;
mul.f32 f1666, f458, 0f3F6C835E;
sub.f32 f520, f1665, f1666;
mul.f32 f521, f458, 0fBEC3EF15;
fma.rn.f32 f522, f457, 0f3F6C835E, f521;
mul.f32 f1663, f461, 0fBF0E39DA;
mul.f32 f1664, f462, 0f3F54DB31;
sub.f32 f525, f1663, f1664;
mul.f32 f526, f462, 0fBF0E39DA;
fma.rn.f32 f527, f461, 0f3F54DB31, f526;
mul.f32 f1661, f465, 0fBF3504F3;
mul.f32 f1662, f466, 0f3F3504F3;
sub.f32 f530, f1661, f1662;
mul.f32 f531, f466, 0fBF3504F3;
fma.rn.f32 f532, f465, 0f3F3504F3, f531;
mul.f32 f1659, f469, 0fBF54DB31;
mul.f32 f1660, f470, 0f3F0E39DA;
sub.f32 f535, f1659, f1660;
mul.f32 f536, f470, 0fBF54DB31;
fma.rn.f32 f537, f469, 0f3F0E39DA, f536;
mul.f32 f539, f474, 0f3EC3EF15;
mul.f32 f1658, f473, 0fBF6C835E;
sub.f32 f540, f1658, f539;
mul.f32 f541, f474, 0fBF6C835E;
fma.rn.f32 f542, f473, 0f3EC3EF15, f541;
mul.f32 f544, f478, 0f3E47C5C2;
mul.f32 f1657, f477, 0fBF7B14BE;
sub.f32 f545, f1657, f544;
mul.f32 f546, f478, 0fBF7B14BE;
fma.rn.f32 f547, f477, 0f3E47C5C2, f546;
add.f32 f548, f272, f447;
sub.f32 f550, f272, f447;
add.f32 f1656, f1731, f1681;
sub.f32 f551, f1731, f1681;
add.f32 f552, f276, f481;
sub.f32 f554, f276, f481;
add.f32 f1655, f1730, f483;
sub.f32 f555, f1730, f483;
add.f32 f556, f280, f486;
sub.f32 f558, f280, f486;
add.f32 f1654, f1729, f488;
sub.f32 f559, f1729, f488;
add.f32 f560, f284, f491;
sub.f32 f562, f284, f491;
add.f32 f1653, f1728, f493;
sub.f32 f563, f1728, f493;
add.f32 f564, f288, f496;
sub.f32 f566, f288, f496;
add.f32 f1652, f1727, f497;
sub.f32 f567, f1727, f497;
add.f32 f568, f292, f500;
sub.f32 f570, f292, f500;
add.f32 f1651, f1726, f502;
sub.f32 f571, f1726, f502;
add.f32 f572, f296, f505;
sub.f32 f574, f296, f505;
add.f32 f1650, f1725, f507;
sub.f32 f575, f1725, f507;
add.f32 f576, f300, f510;
sub.f32 f578, f300, f510;
add.f32 f1649, f1724, f512;
sub.f32 f579, f1724, f512;
sub.f32 f580, f274, f450;
add.f32 f582, f274, f450;
add.f32 f1648, f275, f449;
sub.f32 f583, f275, f449;
add.f32 f584, f278, f515;
sub.f32 f586, f278, f515;
add.f32 f1647, f279, f517;
sub.f32 f587, f279, f517;
add.f32 f588, f282, f520;
sub.f32 f590, f282, f520;
add.f32 f1646, f283, f522;
sub.f32 f591, f283, f522;
add.f32 f592, f286, f525;
sub.f32 f594, f286, f525;
add.f32 f1645, f287, f527;
sub.f32 f595, f287, f527;
add.f32 f596, f290, f530;
sub.f32 f598, f290, f530;
add.f32 f1644, f291, f532;
sub.f32 f599, f291, f532;
add.f32 f600, f294, f535;
sub.f32 f602, f294, f535;
add.f32 f1643, f295, f537;
sub.f32 f603, f295, f537;
add.f32 f604, f298, f540;
sub.f32 f606, f298, f540;
add.f32 f1642, f299, f542;
sub.f32 f607, f299, f542;
add.f32 f608, f302, f545;
sub.f32 f610, f302, f545;
add.f32 f1641, f303, f547;
sub.f32 f611, f303, f547;
mov.u32 r15, %tid.x;
shl.b32 r7, r15, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 248;
mov.u64 rd4, %65;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f612, f613}, [rd5];
mul.f32 f616, f1655, f613;
fma.rn.f32 f617, f612, f552, f616;
mul.f32 f618, f552, f613;
mul.f32 f619, f612, f1655;
sub.f32 f620, f619, f618;
mul.f32 f1639, f612, f612;
mul.f32 f1640, f613, f613;
sub.f32 f623, f1639, f1640;
mul.f32 f624, f613, f612;
fma.rn.f32 f625, f613, f612, f624;
mul.f32 f626, f1654, f625;
fma.rn.f32 f627, f623, f556, f626;
mul.f32 f628, f556, f625;
mul.f32 f629, f623, f1654;
sub.f32 f630, f629, f628;
mul.f32 f1637, f612, f623;
mul.f32 f1638, f613, f625;
sub.f32 f633, f1637, f1638;
mul.f32 f634, f612, f625;
fma.rn.f32 f635, f613, f623, f634;
mul.f32 f636, f1653, f635;
fma.rn.f32 f637, f633, f560, f636;
mul.f32 f638, f560, f635;
mul.f32 f639, f633, f1653;
sub.f32 f640, f639, f638;
mul.f32 f642, f613, f635;
mul.f32 f1636, f612, f633;
sub.f32 f643, f1636, f642;
mul.f32 f644, f612, f635;
fma.rn.f32 f645, f613, f633, f644;
mul.f32 f646, f1652, f645;
fma.rn.f32 f647, f643, f564, f646;
mul.f32 f648, f564, f645;
mul.f32 f649, f643, f1652;
sub.f32 f650, f649, f648;
mul.f32 f652, f613, f645;
mul.f32 f1635, f612, f643;
sub.f32 f653, f1635, f652;
mul.f32 f654, f612, f645;
fma.rn.f32 f655, f613, f643, f654;
mul.f32 f656, f1651, f655;
fma.rn.f32 f657, f653, f568, f656;
mul.f32 f658, f568, f655;
mul.f32 f659, f653, f1651;
sub.f32 f660, f659, f658;
mul.f32 f662, f613, f655;
mul.f32 f1634, f612, f653;
sub.f32 f663, f1634, f662;
mul.f32 f664, f612, f655;
fma.rn.f32 f665, f613, f653, f664;
mul.f32 f666, f1650, f665;
fma.rn.f32 f667, f663, f572, f666;
mul.f32 f668, f572, f665;
mul.f32 f669, f663, f1650;
sub.f32 f670, f669, f668;
mul.f32 f1632, f612, f663;
mul.f32 f1633, f613, f665;
sub.f32 f673, f1632, f1633;
mul.f32 f674, f612, f665;
fma.rn.f32 f675, f613, f663, f674;
mul.f32 f676, f1649, f675;
fma.rn.f32 f677, f673, f576, f676;
mul.f32 f678, f576, f675;
mul.f32 f679, f673, f1649;
sub.f32 f680, f679, f678;
mul.f32 f1630, f612, f673;
mul.f32 f1631, f613, f675;
sub.f32 f683, f1630, f1631;
mul.f32 f684, f612, f675;
fma.rn.f32 f685, f613, f673, f684;
mul.f32 f686, f1648, f685;
fma.rn.f32 f687, f683, f580, f686;
mul.f32 f688, f580, f685;
mul.f32 f689, f683, f1648;
sub.f32 f690, f689, f688;
mul.f32 f692, f613, f685;
mul.f32 f1629, f612, f683;
sub.f32 f693, f1629, f692;
mul.f32 f694, f612, f685;
fma.rn.f32 f695, f613, f683, f694;
mul.f32 f696, f1647, f695;
fma.rn.f32 f697, f693, f584, f696;
mul.f32 f698, f584, f695;
mul.f32 f699, f693, f1647;
sub.f32 f700, f699, f698;
mul.f32 f702, f613, f695;
mul.f32 f1628, f612, f693;
sub.f32 f703, f1628, f702;
mul.f32 f704, f612, f695;
fma.rn.f32 f705, f613, f693, f704;
mul.f32 f706, f1646, f705;
fma.rn.f32 f707, f703, f588, f706;
mul.f32 f708, f588, f705;
mul.f32 f709, f703, f1646;
sub.f32 f710, f709, f708;
mul.f32 f1626, f612, f703;
mul.f32 f1627, f613, f705;
sub.f32 f713, f1626, f1627;
mul.f32 f714, f612, f705;
fma.rn.f32 f715, f613, f703, f714;
mul.f32 f716, f1645, f715;
fma.rn.f32 f717, f713, f592, f716;
mul.f32 f718, f592, f715;
mul.f32 f719, f713, f1645;
sub.f32 f720, f719, f718;
mul.f32 f1624, f612, f713;
mul.f32 f1625, f613, f715;
sub.f32 f723, f1624, f1625;
mul.f32 f724, f612, f715;
fma.rn.f32 f725, f613, f713, f724;
mul.f32 f726, f1644, f725;
fma.rn.f32 f727, f723, f596, f726;
mul.f32 f728, f596, f725;
mul.f32 f729, f723, f1644;
sub.f32 f730, f729, f728;
mul.f32 f732, f613, f725;
mul.f32 f1623, f612, f723;
sub.f32 f733, f1623, f732;
mul.f32 f734, f612, f725;
fma.rn.f32 f735, f613, f723, f734;
mul.f32 f736, f1643, f735;
fma.rn.f32 f737, f733, f600, f736;
mul.f32 f738, f600, f735;
mul.f32 f739, f733, f1643;
sub.f32 f740, f739, f738;
mul.f32 f742, f613, f735;
mul.f32 f1622, f612, f733;
sub.f32 f743, f1622, f742;
mul.f32 f744, f612, f735;
fma.rn.f32 f745, f613, f733, f744;
mul.f32 f746, f1642, f745;
fma.rn.f32 f747, f743, f604, f746;
mul.f32 f748, f604, f745;
mul.f32 f749, f743, f1642;
sub.f32 f750, f749, f748;
mul.f32 f752, f613, f745;
mul.f32 f1621, f612, f743;
sub.f32 f753, f1621, f752;
mul.f32 f754, f612, f745;
fma.rn.f32 f755, f613, f743, f754;
mul.f32 f756, f1641, f755;
fma.rn.f32 f757, f753, f608, f756;
mul.f32 f758, f608, f755;
mul.f32 f759, f753, f1641;
sub.f32 f760, f759, f758;
mul.f32 f1619, f612, f753;
mul.f32 f1620, f613, f755;
sub.f32 f763, f1619, f1620;
mul.f32 f764, f612, f755;
fma.rn.f32 f765, f613, f753, f764;
mul.f32 f766, f551, f765;
fma.rn.f32 f767, f763, f550, f766;
mul.f32 f768, f550, f765;
mul.f32 f769, f763, f551;
sub.f32 f770, f769, f768;
mul.f32 f1617, f612, f763;
mul.f32 f1618, f613, f765;
sub.f32 f773, f1617, f1618;
mul.f32 f774, f612, f765;
fma.rn.f32 f775, f613, f763, f774;
mul.f32 f776, f555, f775;
fma.rn.f32 f777, f773, f554, f776;
mul.f32 f778, f554, f775;
mul.f32 f779, f773, f555;
sub.f32 f780, f779, f778;
mul.f32 f782, f613, f775;
mul.f32 f1616, f612, f773;
sub.f32 f783, f1616, f782;
mul.f32 f784, f612, f775;
fma.rn.f32 f785, f613, f773, f784;
mul.f32 f786, f559, f785;
fma.rn.f32 f787, f783, f558, f786;
mul.f32 f788, f558, f785;
mul.f32 f789, f783, f559;
sub.f32 f790, f789, f788;
mul.f32 f792, f613, f785;
mul.f32 f1615, f612, f783;
sub.f32 f793, f1615, f792;
mul.f32 f794, f612, f785;
fma.rn.f32 f795, f613, f783, f794;
mul.f32 f796, f563, f795;
fma.rn.f32 f797, f793, f562, f796;
mul.f32 f798, f562, f795;
mul.f32 f799, f793, f563;
sub.f32 f800, f799, f798;
mul.f32 f802, f613, f795;
mul.f32 f1614, f612, f793;
sub.f32 f803, f1614, f802;
mul.f32 f804, f612, f795;
fma.rn.f32 f805, f613, f793, f804;
mul.f32 f806, f567, f805;
fma.rn.f32 f807, f803, f566, f806;
mul.f32 f808, f566, f805;
mul.f32 f809, f803, f567;
sub.f32 f810, f809, f808;
mul.f32 f1612, f612, f803;
mul.f32 f1613, f613, f805;
sub.f32 f813, f1612, f1613;
mul.f32 f814, f612, f805;
fma.rn.f32 f815, f613, f803, f814;
mul.f32 f816, f571, f815;
fma.rn.f32 f817, f813, f570, f816;
mul.f32 f818, f570, f815;
mul.f32 f819, f813, f571;
sub.f32 f820, f819, f818;
mul.f32 f1610, f612, f813;
mul.f32 f1611, f613, f815;
sub.f32 f823, f1610, f1611;
mul.f32 f824, f612, f815;
fma.rn.f32 f825, f613, f813, f824;
mul.f32 f826, f575, f825;
fma.rn.f32 f827, f823, f574, f826;
mul.f32 f828, f574, f825;
mul.f32 f829, f823, f575;
sub.f32 f830, f829, f828;
mul.f32 f832, f613, f825;
mul.f32 f1609, f612, f823;
sub.f32 f833, f1609, f832;
mul.f32 f834, f612, f825;
fma.rn.f32 f835, f613, f823, f834;
mul.f32 f836, f579, f835;
fma.rn.f32 f837, f833, f578, f836;
mul.f32 f838, f578, f835;
mul.f32 f839, f833, f579;
sub.f32 f840, f839, f838;
mul.f32 f842, f613, f835;
mul.f32 f1608, f612, f833;
sub.f32 f843, f1608, f842;
mul.f32 f844, f612, f835;
fma.rn.f32 f845, f613, f833, f844;
mul.f32 f846, f583, f845;
fma.rn.f32 f847, f843, f582, f846;
mul.f32 f848, f582, f845;
mul.f32 f849, f843, f583;
sub.f32 f850, f849, f848;
mul.f32 f1606, f612, f843;
mul.f32 f1607, f613, f845;
sub.f32 f853, f1606, f1607;
mul.f32 f854, f612, f845;
fma.rn.f32 f855, f613, f843, f854;
mul.f32 f856, f587, f855;
fma.rn.f32 f857, f853, f586, f856;
mul.f32 f858, f586, f855;
mul.f32 f859, f853, f587;
sub.f32 f860, f859, f858;
mul.f32 f1604, f612, f853;
mul.f32 f1605, f613, f855;
sub.f32 f863, f1604, f1605;
mul.f32 f864, f612, f855;
fma.rn.f32 f865, f613, f853, f864;
mul.f32 f866, f591, f865;
fma.rn.f32 f867, f863, f590, f866;
mul.f32 f868, f590, f865;
mul.f32 f869, f863, f591;
sub.f32 f870, f869, f868;
mul.f32 f872, f613, f865;
mul.f32 f1603, f612, f863;
sub.f32 f873, f1603, f872;
mul.f32 f874, f612, f865;
fma.rn.f32 f875, f613, f863, f874;
mul.f32 f876, f595, f875;
fma.rn.f32 f877, f873, f594, f876;
mul.f32 f878, f594, f875;
mul.f32 f879, f873, f595;
sub.f32 f880, f879, f878;
mul.f32 f882, f613, f875;
mul.f32 f1602, f612, f873;
sub.f32 f883, f1602, f882;
mul.f32 f884, f612, f875;
fma.rn.f32 f885, f613, f873, f884;
mul.f32 f886, f599, f885;
fma.rn.f32 f887, f883, f598, f886;
mul.f32 f888, f598, f885;
mul.f32 f889, f883, f599;
sub.f32 f890, f889, f888;
mul.f32 f892, f613, f885;
mul.f32 f1601, f612, f883;
sub.f32 f893, f1601, f892;
mul.f32 f894, f612, f885;
fma.rn.f32 f895, f613, f883, f894;
mul.f32 f896, f603, f895;
fma.rn.f32 f897, f893, f602, f896;
mul.f32 f898, f602, f895;
mul.f32 f899, f893, f603;
sub.f32 f900, f899, f898;
mul.f32 f1599, f612, f893;
mul.f32 f1600, f613, f895;
sub.f32 f903, f1599, f1600;
mul.f32 f904, f612, f895;
fma.rn.f32 f905, f613, f893, f904;
mul.f32 f906, f607, f905;
fma.rn.f32 f907, f903, f606, f906;
mul.f32 f908, f606, f905;
mul.f32 f909, f903, f607;
sub.f32 f910, f909, f908;
mul.f32 f1597, f612, f903;
mul.f32 f1598, f613, f905;
sub.f32 f913, f1597, f1598;
mul.f32 f914, f612, f905;
fma.rn.f32 f915, f613, f903, f914;
mov.u32 r17, %tid.x;
mul.f32 f916, f611, f915;
fma.rn.f32 f917, f913, f610, f916;
mul.f32 f918, f610, f915;
mul.f32 f919, f913, f611;
sub.f32 f920, f919, f918;
and.b32 r14, r17, 31;
shl.b32 r8, r17, 7;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 3968;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f548, f617, f627, f637};
st.shared.v4.f32 [r12+16], {f647, f657, f667, f677};
st.shared.v4.f32 [r12+32], {f687, f697, f707, f717};
st.shared.v4.f32 [r12+48], {f727, f737, f747, f757};
st.shared.v4.f32 [r12+64], {f767, f777, f787, f797};
st.shared.v4.f32 [r12+80], {f807, f817, f827, f837};
st.shared.v4.f32 [r12+96], {f847, f857, f867, f877};
st.shared.v4.f32 [r12+112], {f887, f897, f907, f917};
barrier.sync 0;
mad.lo.s32 r13, r14, -124, r12;
ld.shared.f32 f921, [r13];
ld.shared.f32 f922, [r13+128];
ld.shared.f32 f923, [r13+256];
ld.shared.f32 f924, [r13+384];
ld.shared.f32 f925, [r13+512];
ld.shared.f32 f926, [r13+640];
ld.shared.f32 f927, [r13+768];
ld.shared.f32 f928, [r13+896];
ld.shared.f32 f929, [r13+1024];
ld.shared.f32 f930, [r13+1152];
ld.shared.f32 f931, [r13+1280];
ld.shared.f32 f932, [r13+1408];
ld.shared.f32 f933, [r13+1536];
ld.shared.f32 f934, [r13+1664];
ld.shared.f32 f935, [r13+1792];
ld.shared.f32 f936, [r13+1920];
ld.shared.f32 f937, [r13+2048];
ld.shared.f32 f938, [r13+2176];
ld.shared.f32 f939, [r13+2304];
ld.shared.f32 f940, [r13+2432];
ld.shared.f32 f941, [r13+2560];
ld.shared.f32 f942, [r13+2688];
ld.shared.f32 f943, [r13+2816];
ld.shared.f32 f944, [r13+2944];
ld.shared.f32 f945, [r13+3072];
ld.shared.f32 f946, [r13+3200];
ld.shared.f32 f947, [r13+3328];
ld.shared.f32 f948, [r13+3456];
ld.shared.f32 f949, [r13+3584];
ld.shared.f32 f950, [r13+3712];
ld.shared.f32 f951, [r13+3840];
ld.shared.f32 f952, [r13+3968];
barrier.sync 0;
st.shared.v4.f32 [r12], {f1656, f620, f630, f640};
st.shared.v4.f32 [r12+16], {f650, f660, f670, f680};
st.shared.v4.f32 [r12+32], {f690, f700, f710, f720};
st.shared.v4.f32 [r12+48], {f730, f740, f750, f760};
st.shared.v4.f32 [r12+64], {f770, f780, f790, f800};
st.shared.v4.f32 [r12+80], {f810, f820, f830, f840};
st.shared.v4.f32 [r12+96], {f850, f860, f870, f880};
st.shared.v4.f32 [r12+112], {f890, f900, f910, f920};
barrier.sync 0;
ld.shared.f32 f953, [r13];
ld.shared.f32 f954, [r13+128];
ld.shared.f32 f955, [r13+256];
ld.shared.f32 f956, [r13+384];
ld.shared.f32 f957, [r13+512];
ld.shared.f32 f958, [r13+640];
ld.shared.f32 f959, [r13+768];
ld.shared.f32 f960, [r13+896];
ld.shared.f32 f961, [r13+1024];
ld.shared.f32 f962, [r13+1152];
ld.shared.f32 f963, [r13+1280];
ld.shared.f32 f964, [r13+1408];
ld.shared.f32 f965, [r13+1536];
ld.shared.f32 f966, [r13+1664];
ld.shared.f32 f967, [r13+1792];
ld.shared.f32 f968, [r13+1920];
ld.shared.f32 f969, [r13+2048];
ld.shared.f32 f970, [r13+2176];
ld.shared.f32 f971, [r13+2304];
ld.shared.f32 f972, [r13+2432];
ld.shared.f32 f973, [r13+2560];
ld.shared.f32 f974, [r13+2688];
ld.shared.f32 f975, [r13+2816];
ld.shared.f32 f976, [r13+2944];
ld.shared.f32 f977, [r13+3072];
ld.shared.f32 f978, [r13+3200];
ld.shared.f32 f979, [r13+3328];
ld.shared.f32 f980, [r13+3456];
ld.shared.f32 f981, [r13+3584];
ld.shared.f32 f982, [r13+3712];
ld.shared.f32 f983, [r13+3840];
ld.shared.f32 f984, [r13+3968];
add.f32 f985, f921, f937;
sub.f32 f987, f921, f937;
add.f32 f1596, f953, f969;
sub.f32 f988, f953, f969;
add.f32 f989, f929, f945;
sub.f32 f991, f929, f945;
add.f32 f1595, f961, f977;
sub.f32 f992, f961, f977;
add.f32 f993, f985, f989;
sub.f32 f995, f985, f989;
add.f32 f1594, f1596, f1595;
sub.f32 f996, f1596, f1595;
sub.f32 f997, f987, f992;
add.f32 f999, f987, f992;
add.f32 f1593, f988, f991;
sub.f32 f1000, f988, f991;
add.f32 f1001, f925, f941;
sub.f32 f1003, f925, f941;
add.f32 f1592, f957, f973;
sub.f32 f1004, f957, f973;
add.f32 f1005, f933, f949;
sub.f32 f1007, f933, f949;
add.f32 f1591, f965, f981;
sub.f32 f1008, f965, f981;
add.f32 f1009, f1001, f1005;
sub.f32 f1011, f1001, f1005;
add.f32 f1590, f1592, f1591;
sub.f32 f1012, f1592, f1591;
sub.f32 f1013, f1003, f1008;
add.f32 f1015, f1003, f1008;
add.f32 f1589, f1004, f1007;
sub.f32 f1016, f1004, f1007;
mul.f32 f1017, f1013, 0f3F3504F3;
mul.f32 f1018, f1589, 0f3F3504F3;
sub.f32 f1019, f1017, f1018;
add.f32 f1020, f1017, f1018;
mul.f32 f1587, f1015, 0fBF3504F3;
mul.f32 f1588, f1016, 0f3F3504F3;
sub.f32 f1023, f1587, f1588;
mul.f32 f1024, f1016, 0fBF3504F3;
fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024;
add.f32 f1026, f993, f1009;
sub.f32 f1028, f993, f1009;
add.f32 f1586, f1594, f1590;
sub.f32 f1029, f1594, f1590;
add.f32 f1030, f997, f1019;
sub.f32 f1032, f997, f1019;
add.f32 f1585, f1593, f1020;
sub.f32 f1033, f1593, f1020;
sub.f32 f1034, f995, f1012;
add.f32 f1036, f995, f1012;
add.f32 f1584, f996, f1011;
sub.f32 f1037, f996, f1011;
add.f32 f1038, f999, f1023;
sub.f32 f1040, f999, f1023;
add.f32 f1583, f1000, f1025;
sub.f32 f1041, f1000, f1025;
add.f32 f1042, f923, f939;
sub.f32 f1044, f923, f939;
add.f32 f1582, f955, f971;
sub.f32 f1045, f955, f971;
add.f32 f1046, f931, f947;
sub.f32 f1048, f931, f947;
add.f32 f1581, f963, f979;
sub.f32 f1049, f963, f979;
add.f32 f1050, f1042, f1046;
sub.f32 f1052, f1042, f1046;
add.f32 f1580, f1582, f1581;
sub.f32 f1053, f1582, f1581;
sub.f32 f1054, f1044, f1049;
add.f32 f1056, f1044, f1049;
add.f32 f1579, f1045, f1048;
sub.f32 f1057, f1045, f1048;
add.f32 f1058, f927, f943;
sub.f32 f1060, f927, f943;
add.f32 f1578, f959, f975;
sub.f32 f1061, f959, f975;
add.f32 f1062, f935, f951;
sub.f32 f1064, f935, f951;
add.f32 f1577, f967, f983;
sub.f32 f1065, f967, f983;
add.f32 f1066, f1058, f1062;
sub.f32 f1068, f1058, f1062;
add.f32 f1576, f1578, f1577;
sub.f32 f1069, f1578, f1577;
sub.f32 f1070, f1060, f1065;
add.f32 f1072, f1060, f1065;
add.f32 f1575, f1061, f1064;
sub.f32 f1073, f1061, f1064;
mul.f32 f1074, f1070, 0f3F3504F3;
mul.f32 f1075, f1575, 0f3F3504F3;
sub.f32 f1076, f1074, f1075;
add.f32 f1077, f1074, f1075;
mul.f32 f1573, f1072, 0fBF3504F3;
mul.f32 f1574, f1073, 0f3F3504F3;
sub.f32 f1080, f1573, f1574;
mul.f32 f1081, f1073, 0fBF3504F3;
fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081;
add.f32 f1083, f1050, f1066;
sub.f32 f1085, f1050, f1066;
add.f32 f1572, f1580, f1576;
sub.f32 f1086, f1580, f1576;
add.f32 f1087, f1054, f1076;
sub.f32 f1089, f1054, f1076;
add.f32 f1571, f1579, f1077;
sub.f32 f1090, f1579, f1077;
sub.f32 f1091, f1052, f1069;
add.f32 f1093, f1052, f1069;
add.f32 f1570, f1053, f1068;
sub.f32 f1094, f1053, f1068;
add.f32 f1095, f1056, f1080;
sub.f32 f1097, f1056, f1080;
add.f32 f1569, f1057, f1082;
sub.f32 f1098, f1057, f1082;
mul.f32 f1567, f1087, 0f3F6C835E;
mul.f32 f1568, f1571, 0f3EC3EF15;
sub.f32 f1101, f1567, f1568;
mul.f32 f1102, f1571, 0f3F6C835E;
fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102;
mul.f32 f1104, f1091, 0f3F3504F3;
mul.f32 f1105, f1570, 0f3F3504F3;
sub.f32 f1106, f1104, f1105;
add.f32 f1107, f1104, f1105;
mul.f32 f1565, f1095, 0f3EC3EF15;
mul.f32 f1566, f1569, 0f3F6C835E;
sub.f32 f1110, f1565, f1566;
mul.f32 f1111, f1569, 0f3EC3EF15;
fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111;
mul.f32 f1563, f1089, 0fBEC3EF15;
mul.f32 f1564, f1090, 0f3F6C835E;
sub.f32 f1115, f1563, f1564;
mul.f32 f1116, f1090, 0fBEC3EF15;
fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116;
mul.f32 f1561, f1093, 0fBF3504F3;
mul.f32 f1562, f1094, 0f3F3504F3;
sub.f32 f1120, f1561, f1562;
mul.f32 f1121, f1094, 0fBF3504F3;
fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121;
mul.f32 f1559, f1097, 0fBF6C835E;
mul.f32 f1560, f1098, 0f3EC3EF15;
sub.f32 f1125, f1559, f1560;
mul.f32 f1126, f1098, 0fBF6C835E;
fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126;
add.f32 f1128, f1026, f1083;
sub.f32 f1130, f1026, f1083;
add.f32 f1558, f1586, f1572;
sub.f32 f1131, f1586, f1572;
add.f32 f1132, f1030, f1101;
sub.f32 f1134, f1030, f1101;
add.f32 f1557, f1585, f1103;
sub.f32 f1135, f1585, f1103;
add.f32 f1136, f1034, f1106;
sub.f32 f1138, f1034, f1106;
add.f32 f1556, f1584, f1107;
sub.f32 f1139, f1584, f1107;
add.f32 f1140, f1038, f1110;
sub.f32 f1142, f1038, f1110;
add.f32 f1555, f1583, f1112;
sub.f32 f1143, f1583, f1112;
sub.f32 f1144, f1028, f1086;
add.f32 f1146, f1028, f1086;
add.f32 f1554, f1029, f1085;
sub.f32 f1147, f1029, f1085;
add.f32 f1148, f1032, f1115;
sub.f32 f1150, f1032, f1115;
add.f32 f1553, f1033, f1117;
sub.f32 f1151, f1033, f1117;
add.f32 f1152, f1036, f1120;
sub.f32 f1154, f1036, f1120;
add.f32 f1552, f1037, f1122;
sub.f32 f1155, f1037, f1122;
add.f32 f1156, f1040, f1125;
sub.f32 f1158, f1040, f1125;
add.f32 f1551, f1041, f1127;
sub.f32 f1159, f1041, f1127;
add.f32 f1160, f922, f938;
sub.f32 f1162, f922, f938;
add.f32 f1550, f954, f970;
sub.f32 f1163, f954, f970;
add.f32 f1164, f930, f946;
sub.f32 f1166, f930, f946;
add.f32 f1549, f962, f978;
sub.f32 f1167, f962, f978;
add.f32 f1168, f1160, f1164;
sub.f32 f1170, f1160, f1164;
add.f32 f1548, f1550, f1549;
sub.f32 f1171, f1550, f1549;
sub.f32 f1172, f1162, f1167;
add.f32 f1174, f1162, f1167;
add.f32 f1547, f1163, f1166;
sub.f32 f1175, f1163, f1166;
add.f32 f1176, f926, f942;
sub.f32 f1178, f926, f942;
add.f32 f1546, f958, f974;
sub.f32 f1179, f958, f974;
add.f32 f1180, f934, f950;
sub.f32 f1182, f934, f950;
add.f32 f1545, f966, f982;
sub.f32 f1183, f966, f982;
add.f32 f1184, f1176, f1180;
sub.f32 f1186, f1176, f1180;
add.f32 f1544, f1546, f1545;
sub.f32 f1187, f1546, f1545;
sub.f32 f1188, f1178, f1183;
add.f32 f1190, f1178, f1183;
add.f32 f1543, f1179, f1182;
sub.f32 f1191, f1179, f1182;
mul.f32 f1192, f1188, 0f3F3504F3;
mul.f32 f1193, f1543, 0f3F3504F3;
sub.f32 f1194, f1192, f1193;
add.f32 f1195, f1192, f1193;
mul.f32 f1541, f1190, 0fBF3504F3;
mul.f32 f1542, f1191, 0f3F3504F3;
sub.f32 f1198, f1541, f1542;
mul.f32 f1199, f1191, 0fBF3504F3;
fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199;
add.f32 f1201, f1168, f1184;
sub.f32 f1203, f1168, f1184;
add.f32 f1540, f1548, f1544;
sub.f32 f1204, f1548, f1544;
add.f32 f1205, f1172, f1194;
sub.f32 f1207, f1172, f1194;
add.f32 f1539, f1547, f1195;
sub.f32 f1208, f1547, f1195;
sub.f32 f1209, f1170, f1187;
add.f32 f1211, f1170, f1187;
add.f32 f1538, f1171, f1186;
sub.f32 f1212, f1171, f1186;
add.f32 f1213, f1174, f1198;
sub.f32 f1215, f1174, f1198;
add.f32 f1537, f1175, f1200;
sub.f32 f1216, f1175, f1200;
add.f32 f1217, f924, f940;
sub.f32 f1219, f924, f940;
add.f32 f1536, f956, f972;
sub.f32 f1220, f956, f972;
add.f32 f1221, f932, f948;
sub.f32 f1223, f932, f948;
add.f32 f1535, f964, f980;
sub.f32 f1224, f964, f980;
add.f32 f1225, f1217, f1221;
sub.f32 f1227, f1217, f1221;
add.f32 f1534, f1536, f1535;
sub.f32 f1228, f1536, f1535;
sub.f32 f1229, f1219, f1224;
add.f32 f1231, f1219, f1224;
add.f32 f1533, f1220, f1223;
sub.f32 f1232, f1220, f1223;
add.f32 f1233, f928, f944;
sub.f32 f1235, f928, f944;
add.f32 f1532, f960, f976;
sub.f32 f1236, f960, f976;
add.f32 f1237, f936, f952;
sub.f32 f1239, f936, f952;
add.f32 f1531, f968, f984;
sub.f32 f1240, f968, f984;
add.f32 f1241, f1233, f1237;
sub.f32 f1243, f1233, f1237;
add.f32 f1530, f1532, f1531;
sub.f32 f1244, f1532, f1531;
sub.f32 f1245, f1235, f1240;
add.f32 f1247, f1235, f1240;
add.f32 f1529, f1236, f1239;
sub.f32 f1248, f1236, f1239;
mul.f32 f1249, f1245, 0f3F3504F3;
mul.f32 f1250, f1529, 0f3F3504F3;
sub.f32 f1251, f1249, f1250;
add.f32 f1252, f1249, f1250;
mul.f32 f1527, f1247, 0fBF3504F3;
mul.f32 f1528, f1248, 0f3F3504F3;
sub.f32 f1255, f1527, f1528;
mul.f32 f1256, f1248, 0fBF3504F3;
fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256;
add.f32 f1258, f1225, f1241;
sub.f32 f1260, f1225, f1241;
add.f32 f1526, f1534, f1530;
sub.f32 f1261, f1534, f1530;
add.f32 f1262, f1229, f1251;
sub.f32 f1264, f1229, f1251;
add.f32 f1525, f1533, f1252;
sub.f32 f1265, f1533, f1252;
sub.f32 f1266, f1227, f1244;
add.f32 f1268, f1227, f1244;
add.f32 f1524, f1228, f1243;
sub.f32 f1269, f1228, f1243;
add.f32 f1270, f1231, f1255;
sub.f32 f1272, f1231, f1255;
add.f32 f1523, f1232, f1257;
sub.f32 f1273, f1232, f1257;
mul.f32 f1521, f1262, 0f3F6C835E;
mul.f32 f1522, f1525, 0f3EC3EF15;
sub.f32 f1276, f1521, f1522;
mul.f32 f1277, f1525, 0f3F6C835E;
fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277;
mul.f32 f1279, f1266, 0f3F3504F3;
mul.f32 f1280, f1524, 0f3F3504F3;
sub.f32 f1281, f1279, f1280;
add.f32 f1282, f1279, f1280;
mul.f32 f1284, f1523, 0f3F6C835E;
mul.f32 f1520, f1270, 0f3EC3EF15;
sub.f32 f1285, f1520, f1284;
mul.f32 f1286, f1523, 0f3EC3EF15;
fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286;
mul.f32 f1289, f1265, 0f3F6C835E;
mul.f32 f1519, f1264, 0fBEC3EF15;
sub.f32 f1290, f1519, f1289;
mul.f32 f1291, f1265, 0fBEC3EF15;
fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291;
mul.f32 f1517, f1268, 0fBF3504F3;
mul.f32 f1518, f1269, 0f3F3504F3;
sub.f32 f1295, f1517, f1518;
mul.f32 f1296, f1269, 0fBF3504F3;
fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296;
mul.f32 f1515, f1272, 0fBF6C835E;
mul.f32 f1516, f1273, 0f3EC3EF15;
sub.f32 f1300, f1515, f1516;
mul.f32 f1301, f1273, 0fBF6C835E;
fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301;
add.f32 f1303, f1201, f1258;
sub.f32 f1305, f1201, f1258;
add.f32 f1514, f1540, f1526;
sub.f32 f1306, f1540, f1526;
add.f32 f1307, f1205, f1276;
sub.f32 f1309, f1205, f1276;
add.f32 f1513, f1539, f1278;
sub.f32 f1310, f1539, f1278;
add.f32 f1311, f1209, f1281;
sub.f32 f1313, f1209, f1281;
add.f32 f1512, f1538, f1282;
sub.f32 f1314, f1538, f1282;
add.f32 f1315, f1213, f1285;
sub.f32 f1317, f1213, f1285;
add.f32 f1511, f1537, f1287;
sub.f32 f1318, f1537, f1287;
sub.f32 f1319, f1203, f1261;
add.f32 f1321, f1203, f1261;
add.f32 f1510, f1204, f1260;
sub.f32 f1322, f1204, f1260;
add.f32 f1323, f1207, f1290;
sub.f32 f1325, f1207, f1290;
add.f32 f1509, f1208, f1292;
sub.f32 f1326, f1208, f1292;
add.f32 f1327, f1211, f1295;
sub.f32 f1329, f1211, f1295;
add.f32 f1508, f1212, f1297;
sub.f32 f1330, f1212, f1297;
add.f32 f1331, f1215, f1300;
sub.f32 f1333, f1215, f1300;
add.f32 f1507, f1216, f1302;
sub.f32 f1334, f1216, f1302;
mul.f32 f1336, f1513, 0f3E47C5C2;
mul.f32 f1506, f1307, 0f3F7B14BE;
sub.f32 f1337, f1506, f1336;
mul.f32 f1338, f1513, 0f3F7B14BE;
fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338;
mul.f32 f1341, f1512, 0f3EC3EF15;
mul.f32 f1505, f1311, 0f3F6C835E;
sub.f32 f1342, f1505, f1341;
mul.f32 f1343, f1512, 0f3F6C835E;
fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343;
mul.f32 f1503, f1315, 0f3F54DB31;
mul.f32 f1504, f1511, 0f3F0E39DA;
sub.f32 f1347, f1503, f1504;
mul.f32 f1348, f1511, 0f3F54DB31;
fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348;
mul.f32 f1350, f1319, 0f3F3504F3;
mul.f32 f1351, f1510, 0f3F3504F3;
sub.f32 f1352, f1350, f1351;
add.f32 f1353, f1350, f1351;
mul.f32 f1355, f1509, 0f3F54DB31;
mul.f32 f1502, f1323, 0f3F0E39DA;
sub.f32 f1356, f1502, f1355;
mul.f32 f1357, f1509, 0f3F0E39DA;
fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357;
mul.f32 f1360, f1508, 0f3F6C835E;
mul.f32 f1501, f1327, 0f3EC3EF15;
sub.f32 f1361, f1501, f1360;
mul.f32 f1362, f1508, 0f3EC3EF15;
fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362;
mul.f32 f1499, f1331, 0f3E47C5C2;
mul.f32 f1500, f1507, 0f3F7B14BE;
sub.f32 f1366, f1499, f1500;
mul.f32 f1367, f1507, 0f3E47C5C2;
fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367;
mul.f32 f1497, f1309, 0fBE47C5C2;
mul.f32 f1498, f1310, 0f3F7B14BE;
sub.f32 f1371, f1497, f1498;
mul.f32 f1372, f1310, 0fBE47C5C2;
fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372;
mul.f32 f1495, f1313, 0fBEC3EF15;
mul.f32 f1496, f1314, 0f3F6C835E;
sub.f32 f1376, f1495, f1496;
mul.f32 f1377, f1314, 0fBEC3EF15;
fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377;
mul.f32 f1493, f1317, 0fBF0E39DA;
mul.f32 f1494, f1318, 0f3F54DB31;
sub.f32 f1381, f1493, f1494;
mul.f32 f1382, f1318, 0fBF0E39DA;
fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382;
mul.f32 f1385, f1322, 0f3F3504F3;
mul.f32 f1492, f1321, 0fBF3504F3;
sub.f32 f1386, f1492, f1385;
mul.f32 f1387, f1322, 0fBF3504F3;
fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387;
mul.f32 f1390, f1326, 0f3F0E39DA;
mul.f32 f1491, f1325, 0fBF54DB31;
sub.f32 f1391, f1491, f1390;
mul.f32 f1392, f1326, 0fBF54DB31;
fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392;
mul.f32 f1395, f1330, 0f3EC3EF15;
mul.f32 f1490, f1329, 0fBF6C835E;
sub.f32 f1396, f1490, f1395;
mul.f32 f1397, f1330, 0fBF6C835E;
fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397;
mul.f32 f1400, f1334, 0f3E47C5C2;
mul.f32 f1489, f1333, 0fBF7B14BE;
sub.f32 f1401, f1489, f1400;
mul.f32 f1402, f1334, 0fBF7B14BE;
fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402;
add.f32 %1, f1558, f1514;
add.f32 %0, f1128, f1303;
add.f32 %2, f1132, f1337;
add.f32 %3, f1557, f1339;
add.f32 %4, f1136, f1342;
add.f32 %5, f1556, f1344;
add.f32 %6, f1140, f1347;
add.f32 %7, f1555, f1349;
add.f32 %9, f1554, f1353;
add.f32 %8, f1144, f1352;
add.f32 %11, f1553, f1358;
add.f32 %10, f1148, f1356;
add.f32 %12, f1152, f1361;
add.f32 %13, f1552, f1363;
add.f32 %14, f1156, f1366;
add.f32 %15, f1551, f1368;
sub.f32 %16, f1130, f1306;
add.f32 %17, f1131, f1305;
add.f32 %18, f1134, f1371;
add.f32 %19, f1135, f1373;
add.f32 %21, f1139, f1378;
add.f32 %20, f1138, f1376;
add.f32 %23, f1143, f1383;
add.f32 %22, f1142, f1381;
add.f32 %25, f1147, f1388;
add.f32 %24, f1146, f1386;
add.f32 %26, f1150, f1391;
add.f32 %27, f1151, f1393;
add.f32 %28, f1154, f1396;
add.f32 %29, f1155, f1398;
add.f32 %30, f1158, f1401;
add.f32 %31, f1159, f1403;
sub.f32 %32, f1128, f1303;
sub.f32 %33, f1558, f1514;
sub.f32 %35, f1557, f1339;
sub.f32 %34, f1132, f1337;
sub.f32 %37, f1556, f1344;
sub.f32 %36, f1136, f1342;
sub.f32 %39, f1555, f1349;
sub.f32 %38, f1140, f1347;
sub.f32 %41, f1554, f1353;
sub.f32 %40, f1144, f1352;
sub.f32 %43, f1553, f1358;
sub.f32 %42, f1148, f1356;
sub.f32 %45, f1552, f1363;
sub.f32 %44, f1152, f1361;
sub.f32 %47, f1551, f1368;
sub.f32 %46, f1156, f1366;
sub.f32 %49, f1131, f1305;
add.f32 %48, f1130, f1306;
sub.f32 %51, f1135, f1373;
sub.f32 %50, f1134, f1371;
sub.f32 %53, f1139, f1378;
sub.f32 %52, f1138, f1376;
sub.f32 %55, f1143, f1383;
sub.f32 %54, f1142, f1381;
sub.f32 %57, f1147, f1388;
sub.f32 %56, f1146, f1386;
sub.f32 %59, f1151, f1393;
sub.f32 %58, f1150, f1391;
sub.f32 %61, f1155, f1398;
sub.f32 %60, f1154, f1396;
sub.f32 %63, f1159, f1403;
sub.f32 %62, f1158, f1401;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), 
"f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<284, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<523>;
.reg .b32 r<27>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %16;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f33, %20, %30;
add.f32 f34, %21, %32;
sub.f32 f35, %20, %30;
sub.f32 f36, %21, %32;
add.f32 f37, %25, %36;
add.f32 f38, %27, %37;
sub.f32 f39, %25, %36;
sub.f32 f40, %27, %37;
add.f32 f41, f33, f37;
add.f32 f42, f34, f38;
sub.f32 f43, f33, f37;
sub.f32 f44, f34, f38;
sub.f32 f45, f35, f40;
add.f32 f46, f36, f39;
add.f32 f47, f35, f40;
sub.f32 f48, f36, f39;
add.f32 f49, %22, %33;
add.f32 f50, %24, %35;
sub.f32 f51, %22, %33;
sub.f32 f52, %24, %35;
add.f32 f53, %28, %38;
add.f32 f54, %29, %39;
sub.f32 f55, %28, %38;
sub.f32 f56, %29, %39;
add.f32 f57, f49, f53;
add.f32 f58, f50, f54;
sub.f32 f59, f49, f53;
sub.f32 f60, f50, f54;
sub.f32 f61, f51, f56;
add.f32 f62, f52, f55;
add.f32 f63, f51, f56;
sub.f32 f64, f52, f55;
mul.f32 f65, f61, 0f3F3504F3;
mul.f32 f66, f62, 0f3F3504F3;
sub.f32 f67, f65, f66;
add.f32 f68, f65, f66;
mul.f32 f69, f63, 0fBF3504F3;
mul.f32 f70, f64, 0f3F3504F3;
sub.f32 f71, f69, f70;
mul.f32 f72, f64, 0fBF3504F3;
fma.rn.f32 f73, f63, 0f3F3504F3, f72;
sub.f32 f74, f41, f57;
sub.f32 f75, f42, f58;
add.f32 f76, f45, f67;
add.f32 f77, f46, f68;
sub.f32 f78, f45, f67;
sub.f32 f79, f46, f68;
sub.f32 f80, f43, f60;
add.f32 f81, f44, f59;
add.f32 f82, f43, f60;
sub.f32 f83, f44, f59;
add.f32 f84, f47, f71;
add.f32 f85, f48, f73;
sub.f32 f86, f47, f71;
sub.f32 f87, f48, f73;
and.b32 r6, r5, 127;
shl.b32 r7, r5, 6;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 1016;
mov.u64 rd4, %17;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f88, f89}, [rd5];
mul.f32 f92, f77, f89;
mul.f32 f93, f76, f89;
mul.f32 f94, f88, f77;
mul.f32 f95, f88, f88;
mul.f32 f96, f89, f89;
sub.f32 f97, f95, f96;
mul.f32 f98, f89, f88;
fma.rn.f32 f99, f89, f88, f98;
mul.f32 f100, f81, f99;
mul.f32 f101, f80, f99;
mul.f32 f102, f97, f81;
mul.f32 f103, f88, f97;
mul.f32 f104, f89, f99;
sub.f32 f105, f103, f104;
mul.f32 f106, f88, f99;
fma.rn.f32 f107, f89, f97, f106;
mul.f32 f108, f85, f107;
mul.f32 f109, f84, f107;
mul.f32 f110, f105, f85;
mul.f32 f111, f88, f105;
mul.f32 f112, f89, f107;
sub.f32 f113, f111, f112;
mul.f32 f114, f88, f107;
fma.rn.f32 f115, f89, f105, f114;
mul.f32 f116, f75, f115;
mul.f32 f117, f74, f115;
mul.f32 f118, f113, f75;
mul.f32 f119, f88, f113;
mul.f32 f120, f89, f115;
sub.f32 f121, f119, f120;
mul.f32 f122, f88, f115;
fma.rn.f32 f123, f89, f113, f122;
mul.f32 f124, f79, f123;
mul.f32 f125, f78, f123;
mul.f32 f126, f121, f79;
mul.f32 f127, f88, f121;
mul.f32 f128, f89, f123;
sub.f32 f129, f127, f128;
mul.f32 f130, f88, f123;
fma.rn.f32 f131, f89, f121, f130;
mul.f32 f132, f83, f131;
mul.f32 f133, f82, f131;
mul.f32 f134, f129, f83;
mul.f32 f135, f88, f129;
mul.f32 f136, f89, f131;
sub.f32 f137, f135, f136;
mul.f32 f138, f88, f131;
fma.rn.f32 f139, f89, f129, f138;
mul.f32 f140, f87, f139;
mul.f32 f141, f86, f139;
mul.f32 f142, f137, f87;
barrier.sync 0;
and.b32 r11, r7, 8128;
add.s32 r12, r9, r11;
add.f32 f143, f42, f58;
add.f32 f144, f41, f57;
fma.rn.f32 f145, f88, f76, f92;
sub.f32 f146, f94, f93;
st.shared.v4.f32 [r12], {f144, f143, f145, f146};
fma.rn.f32 f147, f97, f80, f100;
sub.f32 f148, f102, f101;
sub.f32 f149, f110, f109;
fma.rn.f32 f150, f105, f84, f108;
st.shared.v4.f32 [r12+16], {f147, f148, f150, f149};
fma.rn.f32 f151, f113, f74, f116;
sub.f32 f152, f118, f117;
fma.rn.f32 f153, f121, f78, f124;
sub.f32 f154, f126, f125;
st.shared.v4.f32 [r12+32], {f151, f152, f153, f154};
fma.rn.f32 f155, f129, f82, f132;
sub.f32 f156, f134, f133;
fma.rn.f32 f157, f137, f86, f140;
sub.f32 f158, f142, f141;
st.shared.v4.f32 [r12+48], {f155, f156, f157, f158};
barrier.sync 0;
mad.lo.s32 r13, r6, -56, r12;
ld.shared.v2.f32 {f159, f160}, [r13];
ld.shared.v2.f32 {f163, f164}, [r13+1024];
ld.shared.v2.f32 {f167, f168}, [r13+2048];
ld.shared.v2.f32 {f171, f172}, [r13+3072];
ld.shared.v2.f32 {f175, f176}, [r13+4096];
ld.shared.v2.f32 {f179, f180}, [r13+5120];
ld.shared.v2.f32 {f183, f184}, [r13+6144];
ld.shared.v2.f32 {f187, f188}, [r13+7168];
add.f32 f191, f159, f175;
add.f32 f192, f160, f176;
sub.f32 f193, f159, f175;
sub.f32 f194, f160, f176;
add.f32 f195, f167, f183;
add.f32 f196, f168, f184;
sub.f32 f197, f167, f183;
sub.f32 f198, f168, f184;
add.f32 f199, f191, f195;
add.f32 f200, f192, f196;
sub.f32 f201, f191, f195;
sub.f32 f202, f192, f196;
sub.f32 f203, f193, f198;
add.f32 f204, f194, f197;
add.f32 f205, f193, f198;
sub.f32 f206, f194, f197;
add.f32 f207, f163, f179;
add.f32 f208, f164, f180;
sub.f32 f209, f163, f179;
sub.f32 f210, f164, f180;
add.f32 f211, f171, f187;
add.f32 f212, f172, f188;
sub.f32 f213, f171, f187;
sub.f32 f214, f172, f188;
add.f32 f215, f207, f211;
add.f32 f216, f208, f212;
sub.f32 f217, f207, f211;
sub.f32 f218, f208, f212;
sub.f32 f219, f209, f214;
add.f32 f220, f210, f213;
add.f32 f221, f209, f214;
sub.f32 f222, f210, f213;
mul.f32 f223, f219, 0f3F3504F3;
mul.f32 f224, f220, 0f3F3504F3;
sub.f32 f225, f223, f224;
add.f32 f226, f223, f224;
mul.f32 f227, f221, 0fBF3504F3;
mul.f32 f228, f222, 0f3F3504F3;
sub.f32 f229, f227, f228;
mul.f32 f230, f222, 0fBF3504F3;
fma.rn.f32 f231, f221, 0f3F3504F3, f230;
sub.f32 f232, f199, f215;
sub.f32 f233, f200, f216;
add.f32 f234, f203, f225;
add.f32 f235, f204, f226;
sub.f32 f236, f203, f225;
sub.f32 f237, f204, f226;
sub.f32 f238, f201, f218;
add.f32 f239, f202, f217;
add.f32 f240, f201, f218;
sub.f32 f241, f202, f217;
add.f32 f242, f205, f229;
add.f32 f243, f206, f231;
sub.f32 f244, f205, f229;
sub.f32 f245, f206, f231;
and.b32 r14, r5, 120;
cvt.u64.u32 rd6, r14;
mov.u64 rd7, %18;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f246, f247}, [rd8];
mul.f32 f250, f235, f247;
mul.f32 f251, f234, f247;
mul.f32 f252, f246, f235;
mul.f32 f253, f246, f246;
mul.f32 f254, f247, f247;
sub.f32 f255, f253, f254;
mul.f32 f256, f247, f246;
fma.rn.f32 f257, f247, f246, f256;
mul.f32 f258, f239, f257;
mul.f32 f259, f238, f257;
mul.f32 f260, f255, f239;
mul.f32 f261, f246, f255;
mul.f32 f262, f247, f257;
sub.f32 f263, f261, f262;
mul.f32 f264, f246, f257;
fma.rn.f32 f265, f247, f255, f264;
mul.f32 f266, f243, f265;
mul.f32 f267, f242, f265;
mul.f32 f268, f263, f243;
mul.f32 f269, f246, f263;
mul.f32 f270, f247, f265;
sub.f32 f271, f269, f270;
mul.f32 f272, f246, f265;
fma.rn.f32 f273, f247, f263, f272;
mul.f32 f274, f233, f273;
mul.f32 f275, f232, f273;
mul.f32 f276, f271, f233;
mul.f32 f277, f246, f271;
mul.f32 f278, f247, f273;
sub.f32 f279, f277, f278;
mul.f32 f280, f246, f273;
fma.rn.f32 f281, f247, f271, f280;
mul.f32 f282, f237, f281;
mul.f32 f283, f236, f281;
mul.f32 f284, f279, f237;
mul.f32 f285, f246, f279;
mul.f32 f286, f247, f281;
sub.f32 f287, f285, f286;
mul.f32 f288, f246, f281;
fma.rn.f32 f289, f247, f279, f288;
mul.f32 f290, f241, f289;
mul.f32 f291, f240, f289;
mul.f32 f292, f287, f241;
mul.f32 f293, f246, f287;
mul.f32 f294, f247, f289;
sub.f32 f295, f293, f294;
mul.f32 f296, f246, f289;
fma.rn.f32 f297, f247, f287, f296;
mul.f32 f298, f245, f297;
mul.f32 f299, f244, f297;
mul.f32 f300, f295, f245;
and.b32 r15, r10, 56;
add.s32 r16, r9, r15;
barrier.sync 0;
and.b32 r17, r7, 7680;
add.s32 r18, r16, r17;
add.f32 f301, f200, f216;
add.f32 f302, f199, f215;
st.shared.v2.f32 [r18], {f302, f301};
fma.rn.f32 f303, f246, f234, f250;
sub.f32 f304, f252, f251;
st.shared.v2.f32 [r18+64], {f303, f304};
fma.rn.f32 f305, f255, f238, f258;
sub.f32 f306, f260, f259;
st.shared.v2.f32 [r18+128], {f305, f306};
fma.rn.f32 f307, f263, f242, f266;
sub.f32 f308, f268, f267;
st.shared.v2.f32 [r18+192], {f307, f308};
sub.f32 f309, f276, f275;
fma.rn.f32 f310, f271, f232, f274;
st.shared.v2.f32 [r18+256], {f310, f309};
fma.rn.f32 f311, f279, f236, f282;
sub.f32 f312, f284, f283;
st.shared.v2.f32 [r18+320], {f311, f312};
fma.rn.f32 f313, f287, f240, f290;
sub.f32 f314, f292, f291;
st.shared.v2.f32 [r18+384], {f313, f314};
fma.rn.f32 f315, f295, f244, f298;
sub.f32 f316, f300, f299;
st.shared.v2.f32 [r18+448], {f315, f316};
barrier.sync 0;
mad.lo.s32 r19, r14, -56, r18;
ld.shared.v2.f32 {f317, f318}, [r19];
ld.shared.v2.f32 {f321, f322}, [r19+1024];
ld.shared.v2.f32 {f325, f326}, [r19+2048];
ld.shared.v2.f32 {f329, f330}, [r19+3072];
ld.shared.v2.f32 {f333, f334}, [r19+4096];
ld.shared.v2.f32 {f337, f338}, [r19+5120];
ld.shared.v2.f32 {f341, f342}, [r19+6144];
ld.shared.v2.f32 {f345, f346}, [r19+7168];
add.f32 f349, f317, f333;
add.f32 f350, f318, f334;
sub.f32 f351, f317, f333;
sub.f32 f352, f318, f334;
add.f32 f353, f325, f341;
add.f32 f354, f326, f342;
sub.f32 f355, f325, f341;
sub.f32 f356, f326, f342;
add.f32 f357, f349, f353;
add.f32 f358, f350, f354;
sub.f32 f359, f349, f353;
sub.f32 f360, f350, f354;
sub.f32 f361, f351, f356;
add.f32 f362, f352, f355;
add.f32 f363, f351, f356;
sub.f32 f364, f352, f355;
add.f32 f365, f321, f337;
add.f32 f366, f322, f338;
sub.f32 f367, f321, f337;
sub.f32 f368, f322, f338;
add.f32 f369, f329, f345;
add.f32 f370, f330, f346;
sub.f32 f371, f329, f345;
sub.f32 f372, f330, f346;
add.f32 f373, f365, f369;
add.f32 f374, f366, f370;
sub.f32 f375, f365, f369;
sub.f32 f376, f366, f370;
sub.f32 f377, f367, f372;
add.f32 f378, f368, f371;
add.f32 f379, f367, f372;
sub.f32 f380, f368, f371;
mul.f32 f381, f377, 0f3F3504F3;
mul.f32 f382, f378, 0f3F3504F3;
sub.f32 f383, f381, f382;
add.f32 f384, f381, f382;
mul.f32 f385, f379, 0fBF3504F3;
mul.f32 f386, f380, 0f3F3504F3;
sub.f32 f387, f385, f386;
mul.f32 f388, f380, 0fBF3504F3;
fma.rn.f32 f389, f379, 0f3F3504F3, f388;
sub.f32 f390, f357, f373;
sub.f32 f391, f358, f374;
add.f32 f392, f361, f383;
add.f32 f393, f362, f384;
sub.f32 f394, f361, f383;
sub.f32 f395, f362, f384;
sub.f32 f396, f359, f376;
add.f32 f397, f360, f375;
add.f32 f398, f359, f376;
sub.f32 f399, f360, f375;
add.f32 f400, f363, f387;
add.f32 f401, f364, f389;
sub.f32 f402, f363, f387;
sub.f32 f403, f364, f389;
and.b32 r20, r5, 64;
bfe.u32 r21, r5, 6, 1;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %19;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f404, f405}, [rd11];
mul.f32 f408, f393, f405;
mul.f32 f409, f392, f405;
mul.f32 f410, f404, f393;
mul.f32 f411, f404, f404;
mul.f32 f412, f405, f405;
sub.f32 f413, f411, f412;
mul.f32 f414, f405, f404;
fma.rn.f32 f415, f405, f404, f414;
mul.f32 f416, f397, f415;
mul.f32 f417, f396, f415;
mul.f32 f418, f413, f397;
mul.f32 f419, f404, f413;
mul.f32 f420, f405, f415;
sub.f32 f421, f419, f420;
mul.f32 f422, f404, f415;
fma.rn.f32 f423, f405, f413, f422;
mul.f32 f424, f401, f423;
mul.f32 f425, f400, f423;
mul.f32 f426, f421, f401;
mul.f32 f427, f404, f421;
mul.f32 f428, f405, f423;
sub.f32 f429, f427, f428;
mul.f32 f430, f404, f423;
fma.rn.f32 f431, f405, f421, f430;
mul.f32 f432, f391, f431;
mul.f32 f433, f390, f431;
mul.f32 f434, f429, f391;
mul.f32 f435, f404, f429;
mul.f32 f436, f405, f431;
sub.f32 f437, f435, f436;
mul.f32 f438, f404, f431;
fma.rn.f32 f439, f405, f429, f438;
mul.f32 f440, f395, f439;
mul.f32 f441, f394, f439;
mul.f32 f442, f437, f395;
mul.f32 f443, f404, f437;
mul.f32 f444, f405, f439;
sub.f32 f445, f443, f444;
mul.f32 f446, f404, f439;
fma.rn.f32 f447, f405, f437, f446;
mul.f32 f448, f399, f447;
mul.f32 f449, f398, f447;
mul.f32 f450, f445, f399;
mul.f32 f451, f404, f445;
mul.f32 f452, f405, f447;
sub.f32 f453, f451, f452;
mul.f32 f454, f404, f447;
fma.rn.f32 f455, f405, f445, f454;
mul.f32 f456, f403, f455;
mul.f32 f457, f402, f455;
mul.f32 f458, f453, f403;
and.b32 r22, r10, 504;
add.s32 r23, r9, r22;
barrier.sync 0;
and.b32 r24, r7, 4096;
add.s32 r25, r23, r24;
add.f32 f459, f358, f374;
add.f32 f460, f357, f373;
st.shared.v2.f32 [r25], {f460, f459};
fma.rn.f32 f461, f404, f392, f408;
sub.f32 f462, f410, f409;
st.shared.v2.f32 [r25+512], {f461, f462};
fma.rn.f32 f463, f413, f396, f416;
sub.f32 f464, f418, f417;
st.shared.v2.f32 [r25+1024], {f463, f464};
fma.rn.f32 f465, f421, f400, f424;
sub.f32 f466, f426, f425;
st.shared.v2.f32 [r25+1536], {f465, f466};
sub.f32 f467, f434, f433;
fma.rn.f32 f468, f429, f390, f432;
st.shared.v2.f32 [r25+2048], {f468, f467};
fma.rn.f32 f469, f437, f394, f440;
sub.f32 f470, f442, f441;
st.shared.v2.f32 [r25+2560], {f469, f470};
fma.rn.f32 f471, f445, f398, f448;
sub.f32 f472, f450, f449;
st.shared.v2.f32 [r25+3072], {f471, f472};
fma.rn.f32 f473, f453, f402, f456;
sub.f32 f474, f458, f457;
st.shared.v2.f32 [r25+3584], {f473, f474};
barrier.sync 0;
mad.lo.s32 r26, r20, -56, r25;
ld.shared.v2.f32 {f475, f476}, [r26];
ld.shared.v2.f32 {f479, f480}, [r26+1024];
ld.shared.v2.f32 {f483, f484}, [r26+2048];
ld.shared.v2.f32 {f487, f488}, [r26+3072];
ld.shared.v2.f32 {f491, f492}, [r26+4096];
ld.shared.v2.f32 {f495, f496}, [r26+5120];
ld.shared.v2.f32 {f499, f500}, [r26+6144];
ld.shared.v2.f32 {f503, f504}, [r26+7168];
add.f32 %1, f476, f492;
add.f32 %0, f475, f491;
add.f32 %3, f480, f496;
add.f32 %2, f479, f495;
add.f32 %5, f484, f500;
add.f32 %4, f483, f499;
add.f32 %7, f488, f504;
add.f32 %6, f487, f503;
sub.f32 %9, f476, f492;
sub.f32 %8, f475, f491;
sub.f32 %11, f480, f496;
sub.f32 %10, f479, f495;
sub.f32 %13, f484, f500;
sub.f32 %12, f483, f499;
sub.f32 %15, f488, f504;
sub.f32 %14, f487, f503;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y));
};




// cufftdx_private_function<288, float, 1>
//
// Explicit specialization implemented as a single inline-PTX statement. The 16
// complex<float> values rmem[0..15] are read as inputs and then overwritten
// with the results; `smem` is a 32-bit shared-memory address used as scratch.
// Judging by the header guard (FFT_1024_FP32_INV) this is presumably one
// 16-elements-per-thread variant of a 1024-point inverse FP32 FFT -- the code
// below only shows the arithmetic, so treat that reading as an assumption.
//
// Operand map of the asm statement:
//   %0  .. %31   outputs: rmem[0].x, rmem[0].y, ..., rmem[15].x, rmem[15].y
//   %32          smem
//   %33          lut_sp_16_1024 (64-bit address, read with ld.global.v2.f32)
//   %34          lut_sp_16_64   (64-bit address, read with ld.global.v2.f32)
//   %35 .. %66   inputs: rmem[0].x, rmem[0].y, ..., rmem[15].x, rmem[15].y
//   %67 .. %76   second copies of rmem[8].y, rmem[4].y, rmem[10].y, rmem[2].y,
//                rmem[14].y, rmem[1].y, rmem[5].y, rmem[13].y, rmem[11].y,
//                rmem[7].y (some .y inputs are referenced through these
//                duplicate operands instead of the %35..%66 slot)
//
// Flow of the PTX body:
//   1. r4 = smem + (tid.y << 13); r9 = r4 + ((tid.x << 7) & -8192) is the
//      scratch base used by every shared-memory access below.
//   2. A 16-point butterfly network is applied to the register inputs. The
//      immediates are float encodings: 0f3F3504F3 ~ 0.70710677,
//      0f3F6C835E ~ 0.92387950, 0f3EC3EF15 ~ 0.38268343 (and their negations).
//   3. One complex factor w is loaded from %33 at byte offset
//      (tid.x << 3) & 504. Successive powers of w are formed by repeated
//      complex multiplication, and 15 of the 16 butterfly results are scaled as
//        re = p.x * a.x + p.y * a.y,   im = p.x * a.y - p.y * a.x
//      (i.e. a times the conjugate of the power p); the first result is stored
//      unscaled.
//   4. barrier.sync 0, then the 16 values are stored contiguously
//      (st.shared.v4 at r12 .. r12+112, r12 = r9 + ((tid.x << 7) & 8064)),
//      barrier.sync 0, then 16 values are reloaded with a 512-byte stride from
//      r13 = r12 - 120 * (tid.x & 63).
//   5. A second 16-point butterfly network follows, with a factor loaded from
//      %34 at byte offset 8 * ((tid.x >> 4) & 3) and the same power/scale
//      scheme as in step 3.
//   6. barrier.sync 0, stores with a 128-byte stride from
//      r19 = r9 + ((tid.x << 3) & 120) + ((tid.x << 7) & 6144),
//      barrier.sync 0, reloads with a 512-byte stride from
//      r20 = r19 - 120 * (tid.x & 48).
//   7. Four 4-point butterflies over reloaded elements {j, j+4, j+8, j+12},
//      j = 0..3, produce the outputs written to rmem[j], rmem[j+4], rmem[j+8]
//      and rmem[j+12].
//
// Preconditions visible from the code:
//   - The body executes barrier.sync 0 four times, so every thread that the
//     barrier expects must reach this call (do not call it from divergent
//     control flow).
//   - Shared-memory offsets of up to r13 + 7680 are accessed relative to r9,
//     so each r9 base needs 8192 bytes of scratch.
//
// NOTE(review): the asm statement declares no clobbers ("memory" is absent)
// although it issues st.shared / ld.shared and barriers; confirm callers do
// not rely on the compiler ordering other shared-memory accesses around it.
// NOTE(review): a few temporaries look unused or recomputed (e.g. f1087/f1088,
// and %tid.x is re-read into r34 and r27); presumably an artifact of the
// generator -- left untouched.
template<> __forceinline__ __device__ void cufftdx_private_function<288, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Entire implementation is the PTX raw string below; see the header comment
// above for the operand map and stage breakdown.
asm volatile (R"({
.reg .f32 f<1092>;
.reg .b32 r<35>;
.reg .b64 rd<11>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %32;
add.s32 r4, r3, r2;
add.f32 f65, %35, %51;
sub.f32 f67, %35, %51;
add.f32 f1084, %36, %67;
sub.f32 f68, %36, %67;
add.f32 f69, %43, %59;
sub.f32 f71, %43, %59;
add.f32 f1082, %68, %60;
sub.f32 f72, %68, %60;
add.f32 f73, f65, f69;
sub.f32 f75, f65, f69;
add.f32 f1081, f1084, f1082;
sub.f32 f76, f1084, f1082;
sub.f32 f77, f67, f72;
add.f32 f79, f67, f72;
add.f32 f1080, f68, f71;
sub.f32 f80, f68, f71;
add.f32 f81, %39, %55;
sub.f32 f83, %39, %55;
add.f32 f1077, %70, %69;
sub.f32 f84, %70, %69;
add.f32 f85, %47, %63;
sub.f32 f87, %47, %63;
add.f32 f1075, %48, %71;
sub.f32 f88, %48, %71;
add.f32 f89, f81, f85;
sub.f32 f91, f81, f85;
add.f32 f1074, f1077, f1075;
sub.f32 f92, f1077, f1075;
sub.f32 f93, f83, f88;
add.f32 f95, f83, f88;
add.f32 f1073, f84, f87;
sub.f32 f96, f84, f87;
mul.f32 f97, f93, 0f3F3504F3;
mul.f32 f98, f1073, 0f3F3504F3;
sub.f32 f99, f97, f98;
add.f32 f100, f97, f98;
mul.f32 f1071, f95, 0fBF3504F3;
mul.f32 f1072, f96, 0f3F3504F3;
sub.f32 f103, f1071, f1072;
mul.f32 f104, f96, 0fBF3504F3;
fma.rn.f32 f105, f95, 0f3F3504F3, f104;
add.f32 f106, f73, f89;
sub.f32 f108, f73, f89;
add.f32 f1070, f1081, f1074;
sub.f32 f109, f1081, f1074;
add.f32 f110, f77, f99;
sub.f32 f112, f77, f99;
add.f32 f1069, f1080, f100;
sub.f32 f113, f1080, f100;
sub.f32 f114, f75, f92;
add.f32 f116, f75, f92;
add.f32 f1068, f76, f91;
sub.f32 f117, f76, f91;
add.f32 f118, f79, f103;
sub.f32 f120, f79, f103;
add.f32 f1067, f80, f105;
sub.f32 f121, f80, f105;
add.f32 f122, %37, %53;
sub.f32 f124, %37, %53;
add.f32 f1065, %72, %54;
sub.f32 f125, %72, %54;
add.f32 f126, %45, %61;
sub.f32 f128, %45, %61;
add.f32 f1062, %73, %74;
sub.f32 f129, %73, %74;
add.f32 f130, f122, f126;
sub.f32 f132, f122, f126;
add.f32 f1061, f1065, f1062;
sub.f32 f133, f1065, f1062;
sub.f32 f134, f124, f129;
add.f32 f136, f124, f129;
add.f32 f1060, f125, f128;
sub.f32 f137, f125, f128;
add.f32 f138, %41, %57;
sub.f32 f140, %41, %57;
add.f32 f1058, %42, %75;
sub.f32 f141, %42, %75;
add.f32 f142, %49, %65;
sub.f32 f144, %49, %65;
add.f32 f1056, %76, %66;
sub.f32 f145, %76, %66;
add.f32 f146, f138, f142;
sub.f32 f148, f138, f142;
add.f32 f1055, f1058, f1056;
sub.f32 f149, f1058, f1056;
sub.f32 f150, f140, f145;
add.f32 f152, f140, f145;
add.f32 f1054, f141, f144;
sub.f32 f153, f141, f144;
mul.f32 f154, f150, 0f3F3504F3;
mul.f32 f155, f1054, 0f3F3504F3;
sub.f32 f156, f154, f155;
add.f32 f157, f154, f155;
mul.f32 f1052, f152, 0fBF3504F3;
mul.f32 f1053, f153, 0f3F3504F3;
sub.f32 f160, f1052, f1053;
mul.f32 f161, f153, 0fBF3504F3;
fma.rn.f32 f162, f152, 0f3F3504F3, f161;
add.f32 f163, f130, f146;
sub.f32 f165, f130, f146;
add.f32 f1051, f1061, f1055;
sub.f32 f166, f1061, f1055;
add.f32 f167, f134, f156;
sub.f32 f169, f134, f156;
add.f32 f1050, f1060, f157;
sub.f32 f170, f1060, f157;
sub.f32 f171, f132, f149;
add.f32 f173, f132, f149;
add.f32 f1049, f133, f148;
sub.f32 f174, f133, f148;
add.f32 f175, f136, f160;
sub.f32 f177, f136, f160;
add.f32 f1048, f137, f162;
sub.f32 f178, f137, f162;
mul.f32 f1046, f167, 0f3F6C835E;
mul.f32 f1047, f1050, 0f3EC3EF15;
sub.f32 f181, f1046, f1047;
mul.f32 f182, f1050, 0f3F6C835E;
fma.rn.f32 f183, f167, 0f3EC3EF15, f182;
mul.f32 f184, f171, 0f3F3504F3;
mul.f32 f185, f1049, 0f3F3504F3;
sub.f32 f186, f184, f185;
add.f32 f187, f184, f185;
mul.f32 f189, f1048, 0f3F6C835E;
mul.f32 f1045, f175, 0f3EC3EF15;
sub.f32 f190, f1045, f189;
mul.f32 f191, f1048, 0f3EC3EF15;
fma.rn.f32 f192, f175, 0f3F6C835E, f191;
mul.f32 f194, f170, 0f3F6C835E;
mul.f32 f1044, f169, 0fBEC3EF15;
sub.f32 f195, f1044, f194;
mul.f32 f196, f170, 0fBEC3EF15;
fma.rn.f32 f197, f169, 0f3F6C835E, f196;
mul.f32 f1042, f173, 0fBF3504F3;
mul.f32 f1043, f174, 0f3F3504F3;
sub.f32 f200, f1042, f1043;
mul.f32 f201, f174, 0fBF3504F3;
fma.rn.f32 f202, f173, 0f3F3504F3, f201;
mul.f32 f1040, f177, 0fBF6C835E;
mul.f32 f1041, f178, 0f3EC3EF15;
sub.f32 f205, f1040, f1041;
mul.f32 f206, f178, 0fBF6C835E;
fma.rn.f32 f207, f177, 0f3EC3EF15, f206;
add.f32 f210, f110, f181;
sub.f32 f212, f110, f181;
add.f32 f1039, f1069, f183;
sub.f32 f213, f1069, f183;
add.f32 f214, f114, f186;
sub.f32 f216, f114, f186;
add.f32 f1038, f1068, f187;
sub.f32 f217, f1068, f187;
add.f32 f218, f118, f190;
sub.f32 f220, f118, f190;
add.f32 f1037, f1067, f192;
sub.f32 f221, f1067, f192;
sub.f32 f222, f108, f166;
add.f32 f224, f108, f166;
add.f32 f1036, f109, f165;
sub.f32 f225, f109, f165;
add.f32 f226, f112, f195;
sub.f32 f228, f112, f195;
add.f32 f1035, f113, f197;
sub.f32 f229, f113, f197;
add.f32 f230, f116, f200;
sub.f32 f232, f116, f200;
add.f32 f1034, f117, f202;
sub.f32 f233, f117, f202;
add.f32 f234, f120, f205;
sub.f32 f236, f120, f205;
add.f32 f1033, f121, f207;
sub.f32 f237, f121, f207;
mov.u32 r22, %tid.x;
shl.b32 r7, r22, 7;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r22, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 504;
mov.u64 rd4, %33;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f238, f239}, [rd5];
mul.f32 f242, f1039, f239;
mul.f32 f244, f238, f1039;
mul.f32 f246, f239, f239;
mul.f32 f1032, f238, f238;
sub.f32 f247, f1032, f246;
mul.f32 f248, f239, f238;
fma.rn.f32 f249, f239, f238, f248;
mul.f32 f250, f1038, f249;
mul.f32 f252, f247, f1038;
mul.f32 f1030, f238, f247;
mul.f32 f1031, f239, f249;
sub.f32 f255, f1030, f1031;
mul.f32 f1029, f214, f249;
mul.f32 f256, f238, f249;
fma.rn.f32 f257, f239, f247, f256;
mul.f32 f258, f1037, f257;
mul.f32 f260, f255, f1037;
mul.f32 f262, f239, f257;
mul.f32 f1028, f238, f255;
sub.f32 f263, f1028, f262;
mul.f32 f1027, f218, f257;
mul.f32 f264, f238, f257;
fma.rn.f32 f265, f239, f255, f264;
mul.f32 f266, f1036, f265;
mul.f32 f268, f263, f1036;
mul.f32 f270, f239, f265;
mul.f32 f1026, f238, f263;
sub.f32 f271, f1026, f270;
mul.f32 f1025, f222, f265;
mul.f32 f272, f238, f265;
fma.rn.f32 f273, f239, f263, f272;
mul.f32 f274, f1035, f273;
mul.f32 f276, f271, f1035;
mul.f32 f1023, f238, f271;
mul.f32 f1024, f239, f273;
sub.f32 f279, f1023, f1024;
mul.f32 f1022, f226, f273;
mul.f32 f280, f238, f273;
fma.rn.f32 f281, f239, f271, f280;
mul.f32 f282, f1034, f281;
mul.f32 f284, f279, f1034;
mul.f32 f286, f239, f281;
mul.f32 f1021, f238, f279;
sub.f32 f287, f1021, f286;
mul.f32 f1020, f230, f281;
mul.f32 f288, f238, f281;
fma.rn.f32 f289, f239, f279, f288;
mul.f32 f290, f1033, f289;
mul.f32 f292, f287, f1033;
mul.f32 f294, f239, f289;
mul.f32 f1019, f238, f287;
sub.f32 f295, f1019, f294;
mul.f32 f1018, f234, f289;
mul.f32 f296, f238, f289;
fma.rn.f32 f297, f239, f287, f296;
sub.f32 f1017, f1070, f1051;
mul.f32 f298, f1017, f297;
mul.f32 f300, f295, f1017;
mul.f32 f1015, f238, f295;
mul.f32 f1016, f239, f297;
sub.f32 f303, f1015, f1016;
sub.f32 f1014, f106, f163;
mul.f32 f1013, f1014, f297;
mul.f32 f304, f238, f297;
fma.rn.f32 f305, f239, f295, f304;
mul.f32 f306, f213, f305;
mul.f32 f308, f303, f213;
mul.f32 f310, f239, f305;
mul.f32 f1012, f238, f303;
sub.f32 f311, f1012, f310;
mul.f32 f1011, f212, f305;
mul.f32 f312, f238, f305;
fma.rn.f32 f313, f239, f303, f312;
mul.f32 f314, f217, f313;
mul.f32 f316, f311, f217;
mul.f32 f1009, f238, f311;
mul.f32 f1010, f239, f313;
sub.f32 f319, f1009, f1010;
mul.f32 f1008, f216, f313;
mul.f32 f320, f238, f313;
fma.rn.f32 f321, f239, f311, f320;
mul.f32 f322, f221, f321;
mul.f32 f324, f319, f221;
mul.f32 f326, f239, f321;
mul.f32 f1007, f238, f319;
sub.f32 f327, f1007, f326;
mul.f32 f1006, f220, f321;
mul.f32 f328, f238, f321;
fma.rn.f32 f329, f239, f319, f328;
mul.f32 f330, f225, f329;
mul.f32 f332, f327, f225;
mul.f32 f334, f239, f329;
mul.f32 f1005, f238, f327;
sub.f32 f335, f1005, f334;
mul.f32 f1004, f224, f329;
mul.f32 f336, f238, f329;
fma.rn.f32 f337, f239, f327, f336;
mul.f32 f338, f229, f337;
mul.f32 f340, f335, f229;
mul.f32 f1002, f238, f335;
mul.f32 f1003, f239, f337;
sub.f32 f343, f1002, f1003;
mul.f32 f1001, f228, f337;
mul.f32 f344, f238, f337;
fma.rn.f32 f345, f239, f335, f344;
mul.f32 f346, f233, f345;
mul.f32 f348, f343, f233;
mul.f32 f350, f239, f345;
mul.f32 f1000, f238, f343;
sub.f32 f351, f1000, f350;
mul.f32 f999, f232, f345;
mul.f32 f352, f238, f345;
mul.f32 f998, f210, f239;
fma.rn.f32 f353, f239, f343, f352;
mul.f32 f354, f237, f353;
mul.f32 f355, f236, f353;
mul.f32 f356, f351, f237;
barrier.sync 0;
and.b32 r11, r7, 8064;
add.s32 r12, r9, r11;
add.f32 f357, f1070, f1051;
sub.f32 f1086, f106, f163;
add.f32 f358, f106, f163;
mov.u32 r34, %tid.x;
shl.b32 r28, r34, 7;
shl.b32 r24, r34, 3;
fma.rn.f32 f359, f238, f210, f242;
sub.f32 f360, f244, f998;
st.shared.v4.f32 [r12], {f358, f357, f359, f360};
fma.rn.f32 f361, f247, f214, f250;
sub.f32 f362, f252, f1029;
fma.rn.f32 f363, f255, f218, f258;
sub.f32 f364, f260, f1027;
st.shared.v4.f32 [r12+16], {f361, f362, f363, f364};
sub.f32 f365, f268, f1025;
fma.rn.f32 f366, f263, f222, f266;
fma.rn.f32 f367, f271, f226, f274;
sub.f32 f368, f276, f1022;
st.shared.v4.f32 [r12+32], {f366, f365, f367, f368};
fma.rn.f32 f369, f279, f230, f282;
sub.f32 f370, f284, f1020;
fma.rn.f32 f371, f287, f234, f290;
sub.f32 f372, f292, f1018;
st.shared.v4.f32 [r12+48], {f369, f370, f371, f372};
fma.rn.f32 f373, f295, f1086, f298;
sub.f32 f374, f300, f1013;
fma.rn.f32 f375, f303, f212, f306;
sub.f32 f376, f308, f1011;
st.shared.v4.f32 [r12+64], {f373, f374, f375, f376};
fma.rn.f32 f377, f311, f216, f314;
sub.f32 f378, f316, f1008;
fma.rn.f32 f379, f319, f220, f322;
sub.f32 f380, f324, f1006;
st.shared.v4.f32 [r12+80], {f377, f378, f379, f380};
fma.rn.f32 f381, f327, f224, f330;
sub.f32 f382, f332, f1004;
fma.rn.f32 f383, f335, f228, f338;
sub.f32 f384, f340, f1001;
st.shared.v4.f32 [r12+96], {f381, f382, f383, f384};
fma.rn.f32 f385, f343, f232, f346;
sub.f32 f386, f348, f999;
fma.rn.f32 f387, f351, f236, f354;
sub.f32 f388, f356, f355;
st.shared.v4.f32 [r12+112], {f385, f386, f387, f388};
barrier.sync 0;
and.b32 r21, r34, 63;
mad.lo.s32 r13, r21, -120, r12;
ld.shared.v2.f32 {f389, f390}, [r13];
ld.shared.v2.f32 {f393, f394}, [r13+512];
ld.shared.v2.f32 {f397, f398}, [r13+1024];
ld.shared.v2.f32 {f401, f402}, [r13+1536];
ld.shared.v2.f32 {f405, f406}, [r13+2048];
ld.shared.v2.f32 {f409, f410}, [r13+2560];
ld.shared.v2.f32 {f413, f414}, [r13+3072];
ld.shared.v2.f32 {f417, f418}, [r13+3584];
ld.shared.v2.f32 {f421, f422}, [r13+4096];
ld.shared.v2.f32 {f425, f426}, [r13+4608];
ld.shared.v2.f32 {f429, f430}, [r13+5120];
ld.shared.v2.f32 {f433, f434}, [r13+5632];
ld.shared.v2.f32 {f437, f438}, [r13+6144];
ld.shared.v2.f32 {f441, f442}, [r13+6656];
ld.shared.v2.f32 {f445, f446}, [r13+7168];
ld.shared.v2.f32 {f449, f450}, [r13+7680];
add.f32 f453, f389, f421;
sub.f32 f455, f389, f421;
add.f32 f997, f390, f422;
sub.f32 f456, f390, f422;
add.f32 f457, f405, f437;
sub.f32 f459, f405, f437;
add.f32 f996, f406, f438;
sub.f32 f460, f406, f438;
add.f32 f461, f453, f457;
sub.f32 f463, f453, f457;
add.f32 f995, f997, f996;
sub.f32 f464, f997, f996;
sub.f32 f465, f455, f460;
add.f32 f467, f455, f460;
add.f32 f994, f456, f459;
sub.f32 f468, f456, f459;
add.f32 f469, f397, f429;
sub.f32 f471, f397, f429;
add.f32 f993, f398, f430;
sub.f32 f472, f398, f430;
add.f32 f473, f413, f445;
sub.f32 f475, f413, f445;
add.f32 f992, f414, f446;
sub.f32 f476, f414, f446;
add.f32 f477, f469, f473;
sub.f32 f479, f469, f473;
add.f32 f991, f993, f992;
sub.f32 f480, f993, f992;
sub.f32 f481, f471, f476;
add.f32 f483, f471, f476;
add.f32 f990, f472, f475;
sub.f32 f484, f472, f475;
mul.f32 f485, f481, 0f3F3504F3;
mul.f32 f486, f990, 0f3F3504F3;
sub.f32 f487, f485, f486;
add.f32 f488, f485, f486;
mul.f32 f490, f484, 0f3F3504F3;
mul.f32 f989, f483, 0fBF3504F3;
sub.f32 f491, f989, f490;
mul.f32 f492, f484, 0fBF3504F3;
fma.rn.f32 f493, f483, 0f3F3504F3, f492;
add.f32 f494, f461, f477;
sub.f32 f496, f461, f477;
add.f32 f988, f995, f991;
sub.f32 f497, f995, f991;
add.f32 f498, f465, f487;
sub.f32 f500, f465, f487;
add.f32 f987, f994, f488;
sub.f32 f501, f994, f488;
sub.f32 f502, f463, f480;
add.f32 f504, f463, f480;
add.f32 f986, f464, f479;
sub.f32 f505, f464, f479;
add.f32 f506, f467, f491;
sub.f32 f508, f467, f491;
add.f32 f985, f468, f493;
sub.f32 f509, f468, f493;
add.f32 f510, f393, f425;
sub.f32 f512, f393, f425;
add.f32 f984, f394, f426;
sub.f32 f513, f394, f426;
add.f32 f514, f409, f441;
sub.f32 f516, f409, f441;
add.f32 f983, f410, f442;
sub.f32 f517, f410, f442;
add.f32 f518, f510, f514;
sub.f32 f520, f510, f514;
add.f32 f982, f984, f983;
sub.f32 f521, f984, f983;
sub.f32 f522, f512, f517;
add.f32 f524, f512, f517;
add.f32 f981, f513, f516;
sub.f32 f525, f513, f516;
add.f32 f526, f401, f433;
sub.f32 f528, f401, f433;
add.f32 f980, f402, f434;
sub.f32 f529, f402, f434;
add.f32 f530, f417, f449;
sub.f32 f532, f417, f449;
add.f32 f979, f418, f450;
sub.f32 f533, f418, f450;
add.f32 f534, f526, f530;
sub.f32 f536, f526, f530;
add.f32 f978, f980, f979;
sub.f32 f537, f980, f979;
sub.f32 f538, f528, f533;
add.f32 f540, f528, f533;
add.f32 f977, f529, f532;
sub.f32 f541, f529, f532;
mul.f32 f542, f538, 0f3F3504F3;
mul.f32 f543, f977, 0f3F3504F3;
sub.f32 f544, f542, f543;
add.f32 f545, f542, f543;
mul.f32 f547, f541, 0f3F3504F3;
mul.f32 f976, f540, 0fBF3504F3;
sub.f32 f548, f976, f547;
mul.f32 f549, f541, 0fBF3504F3;
fma.rn.f32 f550, f540, 0f3F3504F3, f549;
add.f32 f551, f518, f534;
sub.f32 f553, f518, f534;
add.f32 f975, f982, f978;
sub.f32 f554, f982, f978;
add.f32 f555, f522, f544;
sub.f32 f557, f522, f544;
add.f32 f974, f981, f545;
sub.f32 f558, f981, f545;
sub.f32 f559, f520, f537;
add.f32 f561, f520, f537;
add.f32 f973, f521, f536;
sub.f32 f562, f521, f536;
add.f32 f563, f524, f548;
sub.f32 f565, f524, f548;
add.f32 f972, f525, f550;
sub.f32 f566, f525, f550;
mul.f32 f568, f974, 0f3EC3EF15;
mul.f32 f971, f555, 0f3F6C835E;
sub.f32 f569, f971, f568;
mul.f32 f570, f974, 0f3F6C835E;
fma.rn.f32 f571, f555, 0f3EC3EF15, f570;
mul.f32 f572, f559, 0f3F3504F3;
mul.f32 f573, f973, 0f3F3504F3;
sub.f32 f574, f572, f573;
add.f32 f575, f572, f573;
mul.f32 f577, f972, 0f3F6C835E;
mul.f32 f970, f563, 0f3EC3EF15;
sub.f32 f578, f970, f577;
mul.f32 f579, f972, 0f3EC3EF15;
fma.rn.f32 f580, f563, 0f3F6C835E, f579;
mul.f32 f582, f558, 0f3F6C835E;
mul.f32 f969, f557, 0fBEC3EF15;
sub.f32 f583, f969, f582;
mul.f32 f584, f558, 0fBEC3EF15;
fma.rn.f32 f585, f557, 0f3F6C835E, f584;
mul.f32 f587, f562, 0f3F3504F3;
mul.f32 f968, f561, 0fBF3504F3;
sub.f32 f588, f968, f587;
mul.f32 f589, f562, 0fBF3504F3;
fma.rn.f32 f590, f561, 0f3F3504F3, f589;
mul.f32 f592, f566, 0f3EC3EF15;
mul.f32 f967, f565, 0fBF6C835E;
sub.f32 f593, f967, f592;
mul.f32 f594, f566, 0fBF6C835E;
fma.rn.f32 f595, f565, 0f3EC3EF15, f594;
add.f32 f598, f498, f569;
sub.f32 f600, f498, f569;
add.f32 f966, f987, f571;
sub.f32 f601, f987, f571;
add.f32 f602, f502, f574;
sub.f32 f604, f502, f574;
add.f32 f965, f986, f575;
sub.f32 f605, f986, f575;
add.f32 f606, f506, f578;
sub.f32 f608, f506, f578;
add.f32 f964, f985, f580;
sub.f32 f609, f985, f580;
sub.f32 f610, f496, f554;
add.f32 f612, f496, f554;
add.f32 f963, f497, f553;
sub.f32 f613, f497, f553;
add.f32 f614, f500, f583;
sub.f32 f616, f500, f583;
add.f32 f962, f501, f585;
sub.f32 f617, f501, f585;
add.f32 f618, f504, f588;
sub.f32 f620, f504, f588;
add.f32 f961, f505, f590;
sub.f32 f621, f505, f590;
add.f32 f622, f508, f593;
sub.f32 f624, f508, f593;
add.f32 f960, f509, f595;
sub.f32 f625, f509, f595;
and.b32 r14, r34, 48;
bfe.u32 r15, r34, 4, 2;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %34;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f626, f627}, [rd8];
mul.f32 f630, f966, f627;
mul.f32 f632, f626, f966;
mul.f32 f634, f627, f627;
mul.f32 f959, f626, f626;
sub.f32 f635, f959, f634;
mul.f32 f636, f627, f626;
fma.rn.f32 f637, f627, f626, f636;
mul.f32 f638, f965, f637;
mul.f32 f640, f635, f965;
mul.f32 f957, f626, f635;
mul.f32 f958, f627, f637;
sub.f32 f643, f957, f958;
mul.f32 f956, f602, f637;
mul.f32 f644, f626, f637;
fma.rn.f32 f645, f627, f635, f644;
mul.f32 f646, f964, f645;
mul.f32 f648, f643, f964;
mul.f32 f650, f627, f645;
mul.f32 f955, f626, f643;
sub.f32 f651, f955, f650;
mul.f32 f954, f606, f645;
mul.f32 f652, f626, f645;
fma.rn.f32 f653, f627, f643, f652;
mul.f32 f654, f963, f653;
mul.f32 f656, f651, f963;
mul.f32 f658, f627, f653;
mul.f32 f953, f626, f651;
sub.f32 f659, f953, f658;
mul.f32 f952, f610, f653;
mul.f32 f660, f626, f653;
fma.rn.f32 f661, f627, f651, f660;
mul.f32 f662, f962, f661;
mul.f32 f664, f659, f962;
mul.f32 f950, f626, f659;
mul.f32 f951, f627, f661;
sub.f32 f667, f950, f951;
mul.f32 f949, f614, f661;
mul.f32 f668, f626, f661;
fma.rn.f32 f669, f627, f659, f668;
mul.f32 f670, f961, f669;
mul.f32 f672, f667, f961;
mul.f32 f674, f627, f669;
mul.f32 f948, f626, f667;
sub.f32 f675, f948, f674;
mul.f32 f947, f618, f669;
mul.f32 f676, f626, f669;
fma.rn.f32 f677, f627, f667, f676;
mul.f32 f678, f960, f677;
mul.f32 f680, f675, f960;
mul.f32 f682, f627, f677;
mul.f32 f946, f626, f675;
sub.f32 f683, f946, f682;
mul.f32 f945, f622, f677;
mul.f32 f684, f626, f677;
fma.rn.f32 f685, f627, f675, f684;
sub.f32 f944, f988, f975;
mul.f32 f686, f944, f685;
mul.f32 f688, f683, f944;
mul.f32 f942, f626, f683;
mul.f32 f943, f627, f685;
sub.f32 f691, f942, f943;
sub.f32 f941, f494, f551;
mul.f32 f940, f941, f685;
mul.f32 f692, f626, f685;
fma.rn.f32 f693, f627, f683, f692;
mul.f32 f694, f601, f693;
mul.f32 f696, f691, f601;
mul.f32 f698, f627, f693;
mul.f32 f939, f626, f691;
sub.f32 f699, f939, f698;
mul.f32 f938, f600, f693;
mul.f32 f700, f626, f693;
fma.rn.f32 f701, f627, f691, f700;
mul.f32 f702, f605, f701;
mul.f32 f704, f699, f605;
mul.f32 f936, f626, f699;
mul.f32 f937, f627, f701;
sub.f32 f707, f936, f937;
mul.f32 f935, f604, f701;
mul.f32 f708, f626, f701;
fma.rn.f32 f709, f627, f699, f708;
mul.f32 f710, f609, f709;
mul.f32 f712, f707, f609;
mul.f32 f714, f627, f709;
mul.f32 f934, f626, f707;
sub.f32 f715, f934, f714;
mul.f32 f933, f608, f709;
mul.f32 f716, f626, f709;
fma.rn.f32 f717, f627, f707, f716;
mul.f32 f718, f613, f717;
mul.f32 f720, f715, f613;
mul.f32 f722, f627, f717;
mul.f32 f932, f626, f715;
sub.f32 f723, f932, f722;
mul.f32 f931, f612, f717;
mul.f32 f724, f626, f717;
fma.rn.f32 f725, f627, f715, f724;
mul.f32 f726, f617, f725;
mul.f32 f728, f723, f617;
mul.f32 f929, f626, f723;
mul.f32 f930, f627, f725;
sub.f32 f731, f929, f930;
mul.f32 f928, f616, f725;
mul.f32 f732, f626, f725;
fma.rn.f32 f733, f627, f723, f732;
mul.f32 f734, f621, f733;
mul.f32 f736, f731, f621;
mul.f32 f738, f627, f733;
mul.f32 f927, f626, f731;
sub.f32 f739, f927, f738;
mul.f32 f926, f620, f733;
mul.f32 f740, f626, f733;
mul.f32 f925, f598, f627;
fma.rn.f32 f741, f627, f731, f740;
mul.f32 f742, f625, f741;
mul.f32 f743, f624, f741;
mul.f32 f744, f739, f625;
and.b32 r16, r24, 120;
add.s32 r17, r9, r16;
sub.f32 f1088, f988, f975;
mul.f32 f1087, f683, f1088;
barrier.sync 0;
and.b32 r18, r28, 6144;
add.s32 r19, r17, r18;
sub.f32 f1090, f988, f975;
mul.f32 f1089, f683, f1090;
add.f32 f745, f988, f975;
sub.f32 f1091, f494, f551;
add.f32 f746, f494, f551;
st.shared.v2.f32 [r19], {f746, f745};
mov.u32 r27, %tid.x;
and.b32 r26, r27, 48;
fma.rn.f32 f747, f626, f598, f630;
sub.f32 f748, f632, f925;
st.shared.v2.f32 [r19+128], {f747, f748};
fma.rn.f32 f749, f635, f602, f638;
sub.f32 f750, f640, f956;
st.shared.v2.f32 [r19+256], {f749, f750};
fma.rn.f32 f751, f643, f606, f646;
sub.f32 f752, f648, f954;
st.shared.v2.f32 [r19+384], {f751, f752};
fma.rn.f32 f753, f651, f610, f654;
sub.f32 f754, f656, f952;
st.shared.v2.f32 [r19+512], {f753, f754};
sub.f32 f755, f664, f949;
fma.rn.f32 f756, f659, f614, f662;
st.shared.v2.f32 [r19+640], {f756, f755};
fma.rn.f32 f757, f667, f618, f670;
sub.f32 f758, f672, f947;
st.shared.v2.f32 [r19+768], {f757, f758};
fma.rn.f32 f759, f675, f622, f678;
sub.f32 f760, f680, f945;
st.shared.v2.f32 [r19+896], {f759, f760};
fma.rn.f32 f761, f683, f1091, f686;
sub.f32 f762, f1089, f940;
st.shared.v2.f32 [r19+1024], {f761, f762};
fma.rn.f32 f763, f691, f600, f694;
sub.f32 f764, f696, f938;
st.shared.v2.f32 [r19+1152], {f763, f764};
fma.rn.f32 f765, f699, f604, f702;
sub.f32 f766, f704, f935;
st.shared.v2.f32 [r19+1280], {f765, f766};
fma.rn.f32 f767, f707, f608, f710;
sub.f32 f768, f712, f933;
st.shared.v2.f32 [r19+1408], {f767, f768};
fma.rn.f32 f769, f715, f612, f718;
sub.f32 f770, f720, f931;
st.shared.v2.f32 [r19+1536], {f769, f770};
fma.rn.f32 f771, f723, f616, f726;
sub.f32 f772, f728, f928;
st.shared.v2.f32 [r19+1664], {f771, f772};
fma.rn.f32 f773, f731, f620, f734;
sub.f32 f774, f736, f926;
st.shared.v2.f32 [r19+1792], {f773, f774};
fma.rn.f32 f775, f739, f624, f742;
sub.f32 f776, f744, f743;
st.shared.v2.f32 [r19+1920], {f775, f776};
barrier.sync 0;
mad.lo.s32 r20, r26, -120, r19;
ld.shared.v2.f32 {f777, f778}, [r20];
ld.shared.v2.f32 {f781, f782}, [r20+512];
ld.shared.v2.f32 {f785, f786}, [r20+1024];
ld.shared.v2.f32 {f789, f790}, [r20+1536];
ld.shared.v2.f32 {f793, f794}, [r20+2048];
ld.shared.v2.f32 {f797, f798}, [r20+2560];
ld.shared.v2.f32 {f801, f802}, [r20+3072];
ld.shared.v2.f32 {f805, f806}, [r20+3584];
ld.shared.v2.f32 {f809, f810}, [r20+4096];
ld.shared.v2.f32 {f813, f814}, [r20+4608];
ld.shared.v2.f32 {f817, f818}, [r20+5120];
ld.shared.v2.f32 {f821, f822}, [r20+5632];
ld.shared.v2.f32 {f825, f826}, [r20+6144];
ld.shared.v2.f32 {f829, f830}, [r20+6656];
ld.shared.v2.f32 {f833, f834}, [r20+7168];
ld.shared.v2.f32 {f837, f838}, [r20+7680];
add.f32 f841, f777, f809;
sub.f32 f843, f777, f809;
add.f32 f924, f778, f810;
sub.f32 f844, f778, f810;
add.f32 f845, f793, f825;
sub.f32 f847, f793, f825;
add.f32 f923, f794, f826;
sub.f32 f848, f794, f826;
add.f32 f849, f781, f813;
sub.f32 f851, f781, f813;
add.f32 f922, f782, f814;
sub.f32 f852, f782, f814;
add.f32 f853, f797, f829;
sub.f32 f855, f797, f829;
add.f32 f921, f798, f830;
sub.f32 f856, f798, f830;
add.f32 f857, f785, f817;
sub.f32 f859, f785, f817;
add.f32 f920, f786, f818;
sub.f32 f860, f786, f818;
add.f32 f861, f801, f833;
sub.f32 f863, f801, f833;
add.f32 f919, f802, f834;
sub.f32 f864, f802, f834;
add.f32 f865, f789, f821;
sub.f32 f867, f789, f821;
add.f32 f918, f790, f822;
sub.f32 f868, f790, f822;
add.f32 f869, f805, f837;
sub.f32 f871, f805, f837;
add.f32 f917, f806, f838;
sub.f32 f872, f806, f838;
add.f32 %1, f924, f923;
add.f32 %0, f841, f845;
add.f32 %2, f849, f853;
add.f32 %3, f922, f921;
add.f32 %4, f857, f861;
add.f32 %5, f920, f919;
add.f32 %6, f865, f869;
add.f32 %7, f918, f917;
sub.f32 %8, f843, f848;
add.f32 %9, f844, f847;
add.f32 %11, f852, f855;
sub.f32 %10, f851, f856;
add.f32 %13, f860, f863;
sub.f32 %12, f859, f864;
add.f32 %15, f868, f871;
sub.f32 %14, f867, f872;
sub.f32 %17, f924, f923;
sub.f32 %16, f841, f845;
sub.f32 %19, f922, f921;
sub.f32 %18, f849, f853;
sub.f32 %21, f920, f919;
sub.f32 %20, f857, f861;
sub.f32 %23, f918, f917;
sub.f32 %22, f865, f869;
sub.f32 %25, f844, f847;
add.f32 %24, f843, f848;
sub.f32 %27, f852, f855;
add.f32 %26, f851, f856;
sub.f32 %29, f860, f863;
add.f32 %28, f859, f864;
sub.f32 %31, f868, f871;
add.f32 %30, f867, f872;
})"
     // Constraint lists: 32 "=f" outputs, then smem ("r"), the two lookup
     // tables ("l"), the 32 rmem inputs, and 10 repeated .y inputs (%67..%76).
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y));
};




// cufftdx_private_function<289, float, 1>
//
// Single-precision FFT kernel body written as one inline-PTX block.
// Judging by the include guard (CUFFTDX_FFT_1024_FP32_INV_PTX_HPP) and the
// lut_sp_4_* table names this is presumably the 4-values-per-thread inverse
// 1024-point variant - TODO confirm against the generator.
//
// Parameters:
//   rmem - per-thread array of complex values; rmem[0..3] (.x and .y) are
//          read as inputs and overwritten in place with the results.
//   smem - 32-bit shared-memory address used as scratch for the exchanges.
//
// What the PTX does:
//   * Scratch base = smem + (%tid.y << 13) + ((%tid.x << 5) & -8192), so each
//     %tid.y value and each group of 256 consecutive %tid.x values works in
//     its own 8192-byte window.
//   * Five 4-input add/sub butterflies are evaluated. After each of the first
//     four, three of the four results are multiplied by w, w^2 and w^3, where
//     w is one (re, im) pair loaded from a lookup table and the powers are
//     formed in registers. The products have the form
//     (re*wr + im*wi, im*wr - re*wi), i.e. multiplication by the conjugate
//     of the table-derived value.
//   * The tables are indexed by bit-fields of %tid.x: bits 0-7 for
//     lut_sp_4_1024, bits 2-7 for lut_sp_4_256, bits 4-7 for lut_sp_4_64 and
//     bits 6-7 for lut_sp_4_16.
//   * Between butterflies the values are exchanged through shared memory as
//     (.x, .y) pairs: barrier.sync 0, stores, barrier.sync 0, loads.
//     Because barrier 0 is used, the function has to be reached by all
//     threads that participate in that barrier.
//
// NOTE(review): the asm statement declares no "memory" clobber although it
// stores to and loads from shared memory - confirm that callers do not rely
// on the compiler ordering other shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<289, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map: %0-%7 outputs (rmem[0..3] .x/.y), %8 smem, %9-%12 lookup
// tables, %13-%22 inputs. rmem[1].y and rmem[2].y are each listed twice in
// the input list (%16/%17 and %19/%20); the PTX only references %17 and %20.
asm volatile (R"({
.reg .f32 f<277>;
.reg .b32 r<35>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %13, %18;
add.f32 f18, %14, %20;
sub.f32 f19, %13, %18;
sub.f32 f20, %14, %20;
add.f32 f21, %15, %21;
add.f32 f22, %17, %22;
sub.f32 f23, %15, %21;
sub.f32 f24, %17, %22;
sub.f32 f25, f17, f21;
sub.f32 f26, f18, f22;
sub.f32 f27, f19, f24;
add.f32 f28, f20, f23;
add.f32 f29, f19, f24;
sub.f32 f30, f20, f23;
and.b32 r6, r5, 255;
shl.b32 r7, r5, 5;
and.b32 r8, r7, -8192;
add.s32 r9, r4, r8;
shl.b32 r10, r5, 3;
cvt.u64.u32 rd2, r10;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f31, f32}, [rd5];
mul.f32 f35, f28, f32;
mul.f32 f36, f27, f32;
mul.f32 f37, f31, f28;
mul.f32 f38, f31, f31;
mul.f32 f39, f32, f32;
sub.f32 f40, f38, f39;
mul.f32 f41, f32, f31;
fma.rn.f32 f42, f32, f31, f41;
mul.f32 f43, f26, f42;
mul.f32 f44, f25, f42;
mul.f32 f45, f40, f26;
mul.f32 f46, f31, f40;
mul.f32 f47, f32, f42;
sub.f32 f48, f46, f47;
mul.f32 f49, f31, f42;
fma.rn.f32 f50, f32, f40, f49;
mul.f32 f51, f30, f50;
mul.f32 f52, f29, f50;
mul.f32 f53, f48, f30;
barrier.sync 0;
and.b32 r11, r7, 8160;
add.s32 r12, r9, r11;
add.f32 f54, f18, f22;
add.f32 f55, f17, f21;
fma.rn.f32 f56, f31, f27, f35;
sub.f32 f57, f37, f36;
st.shared.v4.f32 [r12], {f55, f54, f56, f57};
sub.f32 f58, f45, f44;
fma.rn.f32 f59, f40, f25, f43;
fma.rn.f32 f60, f48, f29, f51;
sub.f32 f61, f53, f52;
st.shared.v4.f32 [r12+16], {f59, f58, f60, f61};
barrier.sync 0;
mad.lo.s32 r13, r6, -24, r12;
ld.shared.v2.f32 {f62, f63}, [r13];
ld.shared.v2.f32 {f66, f67}, [r13+2048];
ld.shared.v2.f32 {f70, f71}, [r13+4096];
ld.shared.v2.f32 {f74, f75}, [r13+6144];
add.f32 f78, f62, f70;
add.f32 f79, f63, f71;
sub.f32 f80, f62, f70;
sub.f32 f81, f63, f71;
add.f32 f82, f66, f74;
add.f32 f83, f67, f75;
sub.f32 f84, f66, f74;
sub.f32 f85, f67, f75;
sub.f32 f86, f78, f82;
sub.f32 f87, f79, f83;
sub.f32 f88, f80, f85;
add.f32 f89, f81, f84;
add.f32 f90, f80, f85;
sub.f32 f91, f81, f84;
and.b32 r14, r5, 252;
bfe.u32 r15, r5, 2, 6;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f92, f93}, [rd8];
mul.f32 f96, f89, f93;
mul.f32 f97, f88, f93;
mul.f32 f98, f92, f89;
mul.f32 f99, f92, f92;
mul.f32 f100, f93, f93;
sub.f32 f101, f99, f100;
mul.f32 f102, f93, f92;
fma.rn.f32 f103, f93, f92, f102;
mul.f32 f104, f87, f103;
mul.f32 f105, f86, f103;
mul.f32 f106, f101, f87;
mul.f32 f107, f92, f101;
mul.f32 f108, f93, f103;
sub.f32 f109, f107, f108;
mul.f32 f110, f92, f103;
fma.rn.f32 f111, f93, f101, f110;
mul.f32 f112, f91, f111;
mul.f32 f113, f90, f111;
mul.f32 f114, f109, f91;
and.b32 r16, r10, 24;
add.s32 r17, r9, r16;
barrier.sync 0;
and.b32 r18, r7, 8064;
add.s32 r19, r17, r18;
add.f32 f115, f79, f83;
add.f32 f116, f78, f82;
st.shared.v2.f32 [r19], {f116, f115};
fma.rn.f32 f117, f92, f88, f96;
sub.f32 f118, f98, f97;
st.shared.v2.f32 [r19+32], {f117, f118};
fma.rn.f32 f119, f101, f86, f104;
sub.f32 f120, f106, f105;
st.shared.v2.f32 [r19+64], {f119, f120};
sub.f32 f121, f114, f113;
fma.rn.f32 f122, f109, f90, f112;
st.shared.v2.f32 [r19+96], {f122, f121};
barrier.sync 0;
mad.lo.s32 r20, r14, -24, r19;
ld.shared.v2.f32 {f123, f124}, [r20];
ld.shared.v2.f32 {f127, f128}, [r20+2048];
ld.shared.v2.f32 {f131, f132}, [r20+4096];
ld.shared.v2.f32 {f135, f136}, [r20+6144];
add.f32 f139, f123, f131;
add.f32 f140, f124, f132;
sub.f32 f141, f123, f131;
sub.f32 f142, f124, f132;
add.f32 f143, f127, f135;
add.f32 f144, f128, f136;
sub.f32 f145, f127, f135;
sub.f32 f146, f128, f136;
sub.f32 f147, f139, f143;
sub.f32 f148, f140, f144;
sub.f32 f149, f141, f146;
add.f32 f150, f142, f145;
add.f32 f151, f141, f146;
sub.f32 f152, f142, f145;
and.b32 r21, r5, 240;
bfe.u32 r22, r5, 4, 4;
mul.wide.u32 rd9, r22, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f153, f154}, [rd11];
mul.f32 f157, f150, f154;
mul.f32 f158, f149, f154;
mul.f32 f159, f153, f150;
mul.f32 f160, f153, f153;
mul.f32 f161, f154, f154;
sub.f32 f162, f160, f161;
mul.f32 f163, f154, f153;
fma.rn.f32 f164, f154, f153, f163;
mul.f32 f165, f148, f164;
mul.f32 f166, f147, f164;
mul.f32 f167, f162, f148;
mul.f32 f168, f153, f162;
mul.f32 f169, f154, f164;
sub.f32 f170, f168, f169;
mul.f32 f171, f153, f164;
fma.rn.f32 f172, f154, f162, f171;
mul.f32 f173, f152, f172;
mul.f32 f174, f151, f172;
mul.f32 f175, f170, f152;
and.b32 r23, r10, 120;
add.s32 r24, r9, r23;
barrier.sync 0;
and.b32 r25, r7, 7680;
add.s32 r26, r24, r25;
add.f32 f176, f140, f144;
add.f32 f177, f139, f143;
st.shared.v2.f32 [r26], {f177, f176};
fma.rn.f32 f178, f153, f149, f157;
sub.f32 f179, f159, f158;
st.shared.v2.f32 [r26+128], {f178, f179};
fma.rn.f32 f180, f162, f147, f165;
sub.f32 f181, f167, f166;
st.shared.v2.f32 [r26+256], {f180, f181};
sub.f32 f182, f175, f174;
fma.rn.f32 f183, f170, f151, f173;
st.shared.v2.f32 [r26+384], {f183, f182};
barrier.sync 0;
mad.lo.s32 r27, r21, -24, r26;
ld.shared.v2.f32 {f184, f185}, [r27];
ld.shared.v2.f32 {f188, f189}, [r27+2048];
ld.shared.v2.f32 {f192, f193}, [r27+4096];
ld.shared.v2.f32 {f196, f197}, [r27+6144];
add.f32 f200, f184, f192;
add.f32 f201, f185, f193;
sub.f32 f202, f184, f192;
sub.f32 f203, f185, f193;
add.f32 f204, f188, f196;
add.f32 f205, f189, f197;
sub.f32 f206, f188, f196;
sub.f32 f207, f189, f197;
sub.f32 f208, f200, f204;
sub.f32 f209, f201, f205;
sub.f32 f210, f202, f207;
add.f32 f211, f203, f206;
add.f32 f212, f202, f207;
sub.f32 f213, f203, f206;
and.b32 r28, r5, 192;
bfe.u32 r29, r5, 6, 2;
mul.wide.u32 rd12, r29, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f214, f215}, [rd14];
mul.f32 f218, f211, f215;
mul.f32 f219, f210, f215;
mul.f32 f220, f214, f211;
mul.f32 f221, f214, f214;
mul.f32 f222, f215, f215;
sub.f32 f223, f221, f222;
mul.f32 f224, f215, f214;
fma.rn.f32 f225, f215, f214, f224;
mul.f32 f226, f209, f225;
mul.f32 f227, f208, f225;
mul.f32 f228, f223, f209;
mul.f32 f229, f214, f223;
mul.f32 f230, f215, f225;
sub.f32 f231, f229, f230;
mul.f32 f232, f214, f225;
fma.rn.f32 f233, f215, f223, f232;
mul.f32 f234, f213, f233;
mul.f32 f235, f212, f233;
mul.f32 f236, f231, f213;
and.b32 r30, r10, 504;
add.s32 r31, r9, r30;
barrier.sync 0;
and.b32 r32, r7, 6144;
add.s32 r33, r31, r32;
add.f32 f237, f201, f205;
add.f32 f238, f200, f204;
st.shared.v2.f32 [r33], {f238, f237};
fma.rn.f32 f239, f214, f210, f218;
sub.f32 f240, f220, f219;
st.shared.v2.f32 [r33+512], {f239, f240};
fma.rn.f32 f241, f223, f208, f226;
sub.f32 f242, f228, f227;
st.shared.v2.f32 [r33+1024], {f241, f242};
sub.f32 f243, f236, f235;
fma.rn.f32 f244, f231, f212, f234;
st.shared.v2.f32 [r33+1536], {f244, f243};
barrier.sync 0;
mad.lo.s32 r34, r28, -24, r33;
ld.shared.v2.f32 {f245, f246}, [r34];
ld.shared.v2.f32 {f249, f250}, [r34+2048];
ld.shared.v2.f32 {f253, f254}, [r34+4096];
ld.shared.v2.f32 {f257, f258}, [r34+6144];
add.f32 f261, f245, f253;
add.f32 f262, f246, f254;
sub.f32 f263, f245, f253;
sub.f32 f264, f246, f254;
add.f32 f265, f249, f257;
add.f32 f266, f250, f258;
sub.f32 f267, f249, f257;
sub.f32 f268, f250, f258;
add.f32 %1, f262, f266;
add.f32 %0, f261, f265;
add.f32 %3, f264, f267;
sub.f32 %2, f263, f268;
sub.f32 %5, f262, f266;
sub.f32 %4, f261, f265;
sub.f32 %7, f264, f267;
add.f32 %6, f263, f268;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<290, float, 1>
//
// Single-precision FFT kernel body written as one inline-PTX block.
// Judging by the include guard (CUFFTDX_FFT_1024_FP32_INV_PTX_HPP) and the
// lut_sp_4_* table names this is presumably a 4-values-per-thread inverse
// 1024-point variant - TODO confirm against the generator. Its shared-memory
// windows are 4096 bytes, and the .x and .y components are exchanged in
// separate passes.
//
// Parameters:
//   rmem - per-thread array of complex values; rmem[0..3] (.x and .y) are
//          read as inputs and overwritten in place with the results.
//   smem - 32-bit shared-memory address used as scratch for the exchanges.
//
// What the PTX does:
//   * Scratch base = smem + (%tid.y << 12) + ((%tid.x << 4) & -4096), so each
//     %tid.y value and each group of 256 consecutive %tid.x values works in
//     its own 4096-byte window.
//   * Five 4-input add/sub butterflies are evaluated. After each of the first
//     four, three of the four results are multiplied by w, w^2 and w^3, where
//     w is one (re, im) pair loaded from a lookup table and the powers are
//     formed in registers. The products have the form
//     (re*wr + im*wi, im*wr - re*wi), i.e. multiplication by the conjugate
//     of the table-derived value.
//   * The tables are indexed by bit-fields of %tid.x: bits 0-7 for
//     lut_sp_4_1024, bits 2-7 for lut_sp_4_256, bits 4-7 for lut_sp_4_64 and
//     bits 6-7 for lut_sp_4_16.
//   * Each exchange runs twice over the same shared-memory addresses: first
//     the .x-derived components are stored and reloaded, then (after a
//     further barrier) the .y-derived components. Every store group and load
//     group is separated by barrier.sync 0, so the function has to be reached
//     by all threads that participate in barrier 0.
//
// NOTE(review): the asm statement declares no "memory" clobber although it
// stores to and loads from shared memory - confirm that callers do not rely
// on the compiler ordering other shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<290, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map: %0-%7 outputs (rmem[0..3] .x/.y), %8 smem, %9-%12 lookup
// tables, %13-%22 inputs. rmem[1].y and rmem[2].y are each listed twice in
// the input list (%16/%17 and %19/%20); the PTX only references %17 and %20.
asm volatile (R"({
.reg .f32 f<245>;
.reg .b32 r<36>;
.reg .b64 rd<15>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %8;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f17, %13, %18;
add.f32 f18, %14, %20;
sub.f32 f19, %13, %18;
sub.f32 f20, %14, %20;
add.f32 f21, %15, %21;
add.f32 f22, %17, %22;
sub.f32 f23, %15, %21;
sub.f32 f24, %17, %22;
add.f32 f25, f17, f21;
add.f32 f26, f18, f22;
sub.f32 f27, f17, f21;
sub.f32 f28, f18, f22;
sub.f32 f29, f19, f24;
add.f32 f30, f20, f23;
add.f32 f31, f19, f24;
sub.f32 f32, f20, f23;
and.b32 r6, r5, 255;
shl.b32 r7, r5, 3;
cvt.u64.u32 rd2, r7;
and.b64 rd3, rd2, 2040;
mov.u64 rd4, %9;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f33, f34}, [rd5];
mul.f32 f37, f30, f34;
fma.rn.f32 f38, f33, f29, f37;
mul.f32 f39, f29, f34;
mul.f32 f40, f33, f30;
sub.f32 f41, f40, f39;
mul.f32 f42, f33, f33;
mul.f32 f43, f34, f34;
sub.f32 f44, f42, f43;
mul.f32 f45, f34, f33;
fma.rn.f32 f46, f34, f33, f45;
mul.f32 f47, f28, f46;
fma.rn.f32 f48, f44, f27, f47;
mul.f32 f49, f27, f46;
mul.f32 f50, f44, f28;
sub.f32 f51, f50, f49;
mul.f32 f52, f33, f44;
mul.f32 f53, f34, f46;
sub.f32 f54, f52, f53;
mul.f32 f55, f33, f46;
fma.rn.f32 f56, f34, f44, f55;
mul.f32 f57, f32, f56;
fma.rn.f32 f58, f54, f31, f57;
mul.f32 f59, f31, f56;
mul.f32 f60, f54, f32;
sub.f32 f61, f60, f59;
shl.b32 r8, r5, 4;
and.b32 r9, r8, -4096;
add.s32 r10, r4, r9;
barrier.sync 0;
and.b32 r11, r8, 4080;
add.s32 r12, r10, r11;
st.shared.v4.f32 [r12], {f25, f38, f48, f58};
barrier.sync 0;
mad.lo.s32 r13, r6, -12, r12;
ld.shared.f32 f62, [r13];
ld.shared.f32 f63, [r13+1024];
ld.shared.f32 f64, [r13+2048];
ld.shared.f32 f65, [r13+3072];
barrier.sync 0;
st.shared.v4.f32 [r12], {f26, f41, f51, f61};
barrier.sync 0;
ld.shared.f32 f66, [r13];
ld.shared.f32 f67, [r13+1024];
ld.shared.f32 f68, [r13+2048];
ld.shared.f32 f69, [r13+3072];
add.f32 f70, f62, f64;
add.f32 f71, f66, f68;
sub.f32 f72, f62, f64;
sub.f32 f73, f66, f68;
add.f32 f74, f63, f65;
add.f32 f75, f67, f69;
sub.f32 f76, f63, f65;
sub.f32 f77, f67, f69;
add.f32 f78, f70, f74;
add.f32 f79, f71, f75;
sub.f32 f80, f70, f74;
sub.f32 f81, f71, f75;
sub.f32 f82, f72, f77;
add.f32 f83, f73, f76;
add.f32 f84, f72, f77;
sub.f32 f85, f73, f76;
and.b32 r14, r5, 252;
bfe.u32 r15, r5, 2, 6;
mul.wide.u32 rd6, r15, 8;
mov.u64 rd7, %10;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f86, f87}, [rd8];
mul.f32 f90, f83, f87;
fma.rn.f32 f91, f86, f82, f90;
mul.f32 f92, f82, f87;
mul.f32 f93, f86, f83;
sub.f32 f94, f93, f92;
mul.f32 f95, f86, f86;
mul.f32 f96, f87, f87;
sub.f32 f97, f95, f96;
mul.f32 f98, f87, f86;
fma.rn.f32 f99, f87, f86, f98;
mul.f32 f100, f81, f99;
fma.rn.f32 f101, f97, f80, f100;
mul.f32 f102, f80, f99;
mul.f32 f103, f97, f81;
sub.f32 f104, f103, f102;
mul.f32 f105, f86, f97;
mul.f32 f106, f87, f99;
sub.f32 f107, f105, f106;
mul.f32 f108, f86, f99;
fma.rn.f32 f109, f87, f97, f108;
mul.f32 f110, f85, f109;
fma.rn.f32 f111, f107, f84, f110;
mul.f32 f112, f84, f109;
mul.f32 f113, f107, f85;
sub.f32 f114, f113, f112;
shl.b32 r16, r5, 2;
and.b32 r17, r16, 12;
add.s32 r18, r10, r17;
barrier.sync 0;
and.b32 r19, r8, 4032;
add.s32 r20, r18, r19;
st.shared.f32 [r20], f78;
st.shared.f32 [r20+16], f91;
st.shared.f32 [r20+32], f101;
st.shared.f32 [r20+48], f111;
barrier.sync 0;
mad.lo.s32 r21, r14, -12, r20;
ld.shared.f32 f115, [r21];
ld.shared.f32 f116, [r21+1024];
ld.shared.f32 f117, [r21+2048];
ld.shared.f32 f118, [r21+3072];
barrier.sync 0;
st.shared.f32 [r20], f79;
st.shared.f32 [r20+16], f94;
st.shared.f32 [r20+32], f104;
st.shared.f32 [r20+48], f114;
barrier.sync 0;
ld.shared.f32 f119, [r21];
ld.shared.f32 f120, [r21+1024];
ld.shared.f32 f121, [r21+2048];
ld.shared.f32 f122, [r21+3072];
add.f32 f123, f115, f117;
add.f32 f124, f119, f121;
sub.f32 f125, f115, f117;
sub.f32 f126, f119, f121;
add.f32 f127, f116, f118;
add.f32 f128, f120, f122;
sub.f32 f129, f116, f118;
sub.f32 f130, f120, f122;
add.f32 f131, f123, f127;
add.f32 f132, f124, f128;
sub.f32 f133, f123, f127;
sub.f32 f134, f124, f128;
sub.f32 f135, f125, f130;
add.f32 f136, f126, f129;
add.f32 f137, f125, f130;
sub.f32 f138, f126, f129;
and.b32 r22, r5, 240;
bfe.u32 r23, r5, 4, 4;
mul.wide.u32 rd9, r23, 8;
mov.u64 rd10, %11;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f139, f140}, [rd11];
mul.f32 f143, f136, f140;
fma.rn.f32 f144, f139, f135, f143;
mul.f32 f145, f135, f140;
mul.f32 f146, f139, f136;
sub.f32 f147, f146, f145;
mul.f32 f148, f139, f139;
mul.f32 f149, f140, f140;
sub.f32 f150, f148, f149;
mul.f32 f151, f140, f139;
fma.rn.f32 f152, f140, f139, f151;
mul.f32 f153, f134, f152;
fma.rn.f32 f154, f150, f133, f153;
mul.f32 f155, f133, f152;
mul.f32 f156, f150, f134;
sub.f32 f157, f156, f155;
mul.f32 f158, f139, f150;
mul.f32 f159, f140, f152;
sub.f32 f160, f158, f159;
mul.f32 f161, f139, f152;
fma.rn.f32 f162, f140, f150, f161;
mul.f32 f163, f138, f162;
fma.rn.f32 f164, f160, f137, f163;
mul.f32 f165, f137, f162;
mul.f32 f166, f160, f138;
sub.f32 f167, f166, f165;
and.b32 r24, r16, 60;
add.s32 r25, r10, r24;
barrier.sync 0;
and.b32 r26, r8, 3840;
add.s32 r27, r25, r26;
st.shared.f32 [r27], f131;
st.shared.f32 [r27+64], f144;
st.shared.f32 [r27+128], f154;
st.shared.f32 [r27+192], f164;
barrier.sync 0;
mad.lo.s32 r28, r22, -12, r27;
ld.shared.f32 f168, [r28];
ld.shared.f32 f169, [r28+1024];
ld.shared.f32 f170, [r28+2048];
ld.shared.f32 f171, [r28+3072];
barrier.sync 0;
st.shared.f32 [r27], f132;
st.shared.f32 [r27+64], f147;
st.shared.f32 [r27+128], f157;
st.shared.f32 [r27+192], f167;
barrier.sync 0;
ld.shared.f32 f172, [r28];
ld.shared.f32 f173, [r28+1024];
ld.shared.f32 f174, [r28+2048];
ld.shared.f32 f175, [r28+3072];
add.f32 f176, f168, f170;
add.f32 f177, f172, f174;
sub.f32 f178, f168, f170;
sub.f32 f179, f172, f174;
add.f32 f180, f169, f171;
add.f32 f181, f173, f175;
sub.f32 f182, f169, f171;
sub.f32 f183, f173, f175;
add.f32 f184, f176, f180;
add.f32 f185, f177, f181;
sub.f32 f186, f176, f180;
sub.f32 f187, f177, f181;
sub.f32 f188, f178, f183;
add.f32 f189, f179, f182;
add.f32 f190, f178, f183;
sub.f32 f191, f179, f182;
and.b32 r29, r5, 192;
bfe.u32 r30, r5, 6, 2;
mul.wide.u32 rd12, r30, 8;
mov.u64 rd13, %12;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f192, f193}, [rd14];
mul.f32 f196, f189, f193;
fma.rn.f32 f197, f192, f188, f196;
mul.f32 f198, f188, f193;
mul.f32 f199, f192, f189;
sub.f32 f200, f199, f198;
mul.f32 f201, f192, f192;
mul.f32 f202, f193, f193;
sub.f32 f203, f201, f202;
mul.f32 f204, f193, f192;
fma.rn.f32 f205, f193, f192, f204;
mul.f32 f206, f187, f205;
fma.rn.f32 f207, f203, f186, f206;
mul.f32 f208, f186, f205;
mul.f32 f209, f203, f187;
sub.f32 f210, f209, f208;
mul.f32 f211, f192, f203;
mul.f32 f212, f193, f205;
sub.f32 f213, f211, f212;
mul.f32 f214, f192, f205;
fma.rn.f32 f215, f193, f203, f214;
mul.f32 f216, f191, f215;
fma.rn.f32 f217, f213, f190, f216;
mul.f32 f218, f190, f215;
mul.f32 f219, f213, f191;
sub.f32 f220, f219, f218;
and.b32 r31, r16, 252;
add.s32 r32, r10, r31;
barrier.sync 0;
and.b32 r33, r8, 3072;
add.s32 r34, r32, r33;
st.shared.f32 [r34], f184;
st.shared.f32 [r34+256], f197;
st.shared.f32 [r34+512], f207;
st.shared.f32 [r34+768], f217;
barrier.sync 0;
mad.lo.s32 r35, r29, -12, r34;
ld.shared.f32 f221, [r35];
ld.shared.f32 f222, [r35+1024];
ld.shared.f32 f223, [r35+2048];
ld.shared.f32 f224, [r35+3072];
barrier.sync 0;
st.shared.f32 [r34], f185;
st.shared.f32 [r34+256], f200;
st.shared.f32 [r34+512], f210;
st.shared.f32 [r34+768], f220;
barrier.sync 0;
ld.shared.f32 f225, [r35];
ld.shared.f32 f226, [r35+1024];
ld.shared.f32 f227, [r35+2048];
ld.shared.f32 f228, [r35+3072];
add.f32 f229, f221, f223;
add.f32 f230, f225, f227;
sub.f32 f231, f221, f223;
sub.f32 f232, f225, f227;
add.f32 f233, f222, f224;
add.f32 f234, f226, f228;
sub.f32 f235, f222, f224;
sub.f32 f236, f226, f228;
add.f32 %0, f229, f233;
add.f32 %1, f230, f234;
add.f32 %3, f232, f235;
sub.f32 %2, f231, f236;
sub.f32 %4, f229, f233;
sub.f32 %5, f230, f234;
sub.f32 %7, f232, f235;
add.f32 %6, f231, f236;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y));
};




// cufftdx_private_function<291, float, 1>
//
// Single-precision FFT kernel body written as one inline-PTX block.
// Judging by the include guard (CUFFTDX_FFT_1024_FP32_INV_PTX_HPP) and the
// lut_sp_2_* table names this is presumably the 2-values-per-thread inverse
// 1024-point variant - TODO confirm against the generator.
//
// Parameters:
//   rmem - per-thread array of complex values; rmem[0..1] (.x and .y) are
//          read as inputs and overwritten in place with the results.
//   smem - 32-bit shared-memory address used as scratch for the exchanges.
//
// What the PTX does:
//   * Scratch base = smem + (%tid.y << 13) + ((%tid.x << 4) & -8192), so each
//     %tid.y value and each group of 512 consecutive %tid.x values works in
//     its own 8192-byte window.
//   * Ten 2-input sum/difference steps are evaluated. After each of the first
//     nine, the difference is multiplied by one (re, im) pair loaded from a
//     lookup table; the product has the form
//     (re*wr + im*wi, im*wr - re*wi), i.e. multiplication by the conjugate
//     of the table entry. The sum is passed on unchanged.
//   * The nine tables (lut_sp_2_1024 down to lut_sp_2_4) are indexed with
//     progressively narrower high-order bit-fields of %tid.x, starting with
//     bits 0-8 and ending with bit 8 alone.
//   * Between steps the values are exchanged through shared memory as
//     (.x, .y) pairs: barrier.sync 0, two stores, barrier.sync 0, two loads
//     (at offsets 0 and 4096 from the computed read address).
//     Because barrier 0 is used, the function has to be reached by all
//     threads that participate in that barrier.
//
// NOTE(review): the asm statement declares no "memory" clobber although it
// stores to and loads from shared memory - confirm that callers do not rely
// on the compiler ordering other shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<291, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map: %0-%3 outputs (rmem[0..1] .x/.y), %4 smem, %5-%13 lookup
// tables, %14-%17 inputs (rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y).
asm volatile (R"({
.reg .f32 f<202>;
.reg .b32 r<70>;
.reg .b64 rd<30>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 13;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
sub.f32 f9, %14, %16;
sub.f32 f10, %15, %17;
shl.b32 r6, r5, 4;
and.b32 r7, r6, -8192;
add.s32 r8, r4, r7;
shl.b32 r9, r5, 3;
cvt.u64.u32 rd2, r9;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f11, f12}, [rd5];
mul.f32 f15, f10, f12;
mul.f32 f16, f9, f12;
mul.f32 f17, f11, f10;
barrier.sync 0;
and.b32 r10, r6, 8176;
add.s32 r11, r8, r10;
add.f32 f18, %15, %17;
add.f32 f19, %14, %16;
st.shared.v2.f32 [r11], {f19, f18};
sub.f32 f20, f17, f16;
fma.rn.f32 f21, f11, f9, f15;
st.shared.v2.f32 [r11+8], {f21, f20};
barrier.sync 0;
and.b32 r12, r9, 4088;
sub.s32 r13, r11, r12;
ld.shared.v2.f32 {f22, f23}, [r13];
ld.shared.v2.f32 {f26, f27}, [r13+4096];
sub.f32 f30, f22, f26;
sub.f32 f31, f23, f27;
bfe.u32 r14, r5, 1, 8;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f32, f33}, [rd8];
mul.f32 f36, f31, f33;
mul.f32 f37, f30, f33;
mul.f32 f38, f32, f31;
and.b32 r15, r9, 8;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 8160;
add.s32 r18, r16, r17;
add.f32 f39, f23, f27;
add.f32 f40, f22, f26;
st.shared.v2.f32 [r18], {f40, f39};
fma.rn.f32 f41, f32, f30, f36;
sub.f32 f42, f38, f37;
st.shared.v2.f32 [r18+16], {f41, f42};
barrier.sync 0;
and.b32 r19, r9, 4080;
sub.s32 r20, r18, r19;
ld.shared.v2.f32 {f43, f44}, [r20];
ld.shared.v2.f32 {f47, f48}, [r20+4096];
sub.f32 f51, f43, f47;
sub.f32 f52, f44, f48;
bfe.u32 r21, r5, 2, 7;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f53, f54}, [rd11];
mul.f32 f57, f52, f54;
mul.f32 f58, f51, f54;
mul.f32 f59, f53, f52;
and.b32 r22, r9, 24;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 8128;
add.s32 r25, r23, r24;
add.f32 f60, f44, f48;
add.f32 f61, f43, f47;
st.shared.v2.f32 [r25], {f61, f60};
fma.rn.f32 f62, f53, f51, f57;
sub.f32 f63, f59, f58;
st.shared.v2.f32 [r25+32], {f62, f63};
barrier.sync 0;
and.b32 r26, r9, 4064;
sub.s32 r27, r25, r26;
ld.shared.v2.f32 {f64, f65}, [r27];
ld.shared.v2.f32 {f68, f69}, [r27+4096];
sub.f32 f72, f64, f68;
sub.f32 f73, f65, f69;
and.b32 r28, r5, 504;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f74, f75}, [rd14];
mul.f32 f78, f73, f75;
mul.f32 f79, f72, f75;
mul.f32 f80, f74, f73;
and.b32 r29, r9, 56;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 8064;
add.s32 r32, r30, r31;
add.f32 f81, f65, f69;
add.f32 f82, f64, f68;
st.shared.v2.f32 [r32], {f82, f81};
fma.rn.f32 f83, f74, f72, f78;
sub.f32 f84, f80, f79;
st.shared.v2.f32 [r32+64], {f83, f84};
barrier.sync 0;
and.b32 r33, r9, 4032;
sub.s32 r34, r32, r33;
ld.shared.v2.f32 {f85, f86}, [r34];
ld.shared.v2.f32 {f89, f90}, [r34+4096];
sub.f32 f93, f85, f89;
sub.f32 f94, f86, f90;
bfe.u32 r35, r5, 4, 5;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f95, f96}, [rd17];
mul.f32 f99, f94, f96;
mul.f32 f100, f93, f96;
mul.f32 f101, f95, f94;
and.b32 r36, r9, 120;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 7936;
add.s32 r39, r37, r38;
add.f32 f102, f86, f90;
add.f32 f103, f85, f89;
st.shared.v2.f32 [r39], {f103, f102};
fma.rn.f32 f104, f95, f93, f99;
sub.f32 f105, f101, f100;
st.shared.v2.f32 [r39+128], {f104, f105};
barrier.sync 0;
and.b32 r40, r9, 3968;
sub.s32 r41, r39, r40;
ld.shared.v2.f32 {f106, f107}, [r41];
ld.shared.v2.f32 {f110, f111}, [r41+4096];
sub.f32 f114, f106, f110;
sub.f32 f115, f107, f111;
bfe.u32 r42, r5, 5, 4;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f116, f117}, [rd20];
mul.f32 f120, f115, f117;
mul.f32 f121, f114, f117;
mul.f32 f122, f116, f115;
and.b32 r43, r9, 248;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 7680;
add.s32 r46, r44, r45;
add.f32 f123, f107, f111;
add.f32 f124, f106, f110;
st.shared.v2.f32 [r46], {f124, f123};
fma.rn.f32 f125, f116, f114, f120;
sub.f32 f126, f122, f121;
st.shared.v2.f32 [r46+256], {f125, f126};
barrier.sync 0;
and.b32 r47, r9, 3840;
sub.s32 r48, r46, r47;
ld.shared.v2.f32 {f127, f128}, [r48];
ld.shared.v2.f32 {f131, f132}, [r48+4096];
sub.f32 f135, f127, f131;
sub.f32 f136, f128, f132;
bfe.u32 r49, r5, 6, 3;
mul.wide.u32 rd21, r49, 8;
mov.u64 rd22, %11;
add.s64 rd23, rd22, rd21;
ld.global.v2.f32 {f137, f138}, [rd23];
mul.f32 f141, f136, f138;
mul.f32 f142, f135, f138;
mul.f32 f143, f137, f136;
and.b32 r50, r9, 504;
add.s32 r51, r8, r50;
barrier.sync 0;
and.b32 r52, r6, 7168;
add.s32 r53, r51, r52;
add.f32 f144, f128, f132;
add.f32 f145, f127, f131;
st.shared.v2.f32 [r53], {f145, f144};
fma.rn.f32 f146, f137, f135, f141;
sub.f32 f147, f143, f142;
st.shared.v2.f32 [r53+512], {f146, f147};
barrier.sync 0;
and.b32 r54, r9, 3584;
sub.s32 r55, r53, r54;
ld.shared.v2.f32 {f148, f149}, [r55];
ld.shared.v2.f32 {f152, f153}, [r55+4096];
sub.f32 f156, f148, f152;
sub.f32 f157, f149, f153;
bfe.u32 r56, r5, 7, 2;
mul.wide.u32 rd24, r56, 8;
mov.u64 rd25, %12;
add.s64 rd26, rd25, rd24;
ld.global.v2.f32 {f158, f159}, [rd26];
mul.f32 f162, f157, f159;
mul.f32 f163, f156, f159;
mul.f32 f164, f158, f157;
and.b32 r57, r9, 1016;
add.s32 r58, r8, r57;
barrier.sync 0;
and.b32 r59, r6, 6144;
add.s32 r60, r58, r59;
add.f32 f165, f149, f153;
add.f32 f166, f148, f152;
st.shared.v2.f32 [r60], {f166, f165};
fma.rn.f32 f167, f158, f156, f162;
sub.f32 f168, f164, f163;
st.shared.v2.f32 [r60+1024], {f167, f168};
barrier.sync 0;
and.b32 r61, r9, 3072;
sub.s32 r62, r60, r61;
ld.shared.v2.f32 {f169, f170}, [r62];
ld.shared.v2.f32 {f173, f174}, [r62+4096];
sub.f32 f177, f169, f173;
sub.f32 f178, f170, f174;
bfe.u32 r63, r5, 8, 1;
mul.wide.u32 rd27, r63, 8;
mov.u64 rd28, %13;
add.s64 rd29, rd28, rd27;
ld.global.v2.f32 {f179, f180}, [rd29];
mul.f32 f183, f178, f180;
mul.f32 f184, f177, f180;
mul.f32 f185, f179, f178;
and.b32 r64, r9, 2040;
add.s32 r65, r8, r64;
barrier.sync 0;
and.b32 r66, r6, 4096;
add.s32 r67, r65, r66;
add.f32 f186, f170, f174;
add.f32 f187, f169, f173;
st.shared.v2.f32 [r67], {f187, f186};
fma.rn.f32 f188, f179, f177, f183;
sub.f32 f189, f185, f184;
st.shared.v2.f32 [r67+2048], {f188, f189};
barrier.sync 0;
and.b32 r68, r9, 2048;
sub.s32 r69, r67, r68;
ld.shared.v2.f32 {f190, f191}, [r69];
ld.shared.v2.f32 {f194, f195}, [r69+4096];
add.f32 %1, f191, f195;
add.f32 %0, f190, f194;
sub.f32 %3, f191, f195;
sub.f32 %2, f190, f194;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};




// cufftdx_private_function<292, float, 1>
//
// Single-precision FFT kernel body written as one inline-PTX block.
// Judging by the include guard (CUFFTDX_FFT_1024_FP32_INV_PTX_HPP) and the
// lut_sp_2_* table names this is presumably a 2-values-per-thread inverse
// 1024-point variant - TODO confirm against the generator. Its shared-memory
// windows are 4096 bytes, and the .x and .y components are exchanged in
// separate passes.
//
// Parameters:
//   rmem - per-thread array of complex values; rmem[0..1] (.x and .y) are
//          read as inputs and overwritten in place with the results.
//   smem - 32-bit shared-memory address used as scratch for the exchanges.
//
// What the PTX does:
//   * Scratch base = smem + (%tid.y << 12) + ((%tid.x << 3) & -4096), so each
//     %tid.y value and each group of 512 consecutive %tid.x values works in
//     its own 4096-byte window.
//   * Ten 2-input sum/difference steps are evaluated. After each of the first
//     nine, the difference is multiplied by one (re, im) pair loaded from a
//     lookup table; the product has the form
//     (re*wr + im*wi, im*wr - re*wi), i.e. multiplication by the conjugate
//     of the table entry. The sum is passed on unchanged.
//   * The nine tables (lut_sp_2_1024 down to lut_sp_2_4) are indexed with
//     progressively narrower high-order bit-fields of %tid.x, starting with
//     bits 0-8 and ending with bit 8 alone.
//   * Each exchange runs twice over the same shared-memory addresses: first
//     the .x-derived components are stored and reloaded, then (after a
//     further barrier) the .y-derived components. Every store group and load
//     group is separated by barrier.sync 0, so the function has to be reached
//     by all threads that participate in barrier 0.
//
// NOTE(review): the asm statement declares no "memory" clobber although it
// stores to and loads from shared memory - confirm that callers do not rely
// on the compiler ordering other shared-memory accesses around this call.
template<> __forceinline__ __device__ void cufftdx_private_function<292, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Operand map: %0-%3 outputs (rmem[0..1] .x/.y), %4 smem, %5-%13 lookup
// tables, %14-%17 inputs (rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y).
asm volatile (R"({
.reg .f32 f<166>;
.reg .b32 r<70>;
.reg .b64 rd<30>;
mov.u32 r1, %tid.y;
shl.b32 r2, r1, 12;
mov.u32 r3, %4;
add.s32 r4, r3, r2;
mov.u32 r5, %tid.x;
add.f32 f9, %14, %16;
add.f32 f10, %15, %17;
sub.f32 f11, %14, %16;
sub.f32 f12, %15, %17;
shl.b32 r6, r5, 3;
cvt.u64.u32 rd2, r6;
and.b64 rd3, rd2, 4088;
mov.u64 rd4, %5;
add.s64 rd5, rd4, rd3;
ld.global.v2.f32 {f13, f14}, [rd5];
mul.f32 f17, f12, f14;
fma.rn.f32 f18, f13, f11, f17;
mul.f32 f19, f11, f14;
mul.f32 f20, f13, f12;
sub.f32 f21, f20, f19;
and.b32 r7, r6, -4096;
add.s32 r8, r4, r7;
barrier.sync 0;
and.b32 r9, r6, 4088;
add.s32 r10, r8, r9;
st.shared.v2.f32 [r10], {f9, f18};
barrier.sync 0;
shl.b32 r11, r5, 2;
and.b32 r12, r11, 2044;
sub.s32 r13, r10, r12;
ld.shared.f32 f22, [r13];
ld.shared.f32 f23, [r13+2048];
barrier.sync 0;
st.shared.v2.f32 [r10], {f10, f21};
barrier.sync 0;
ld.shared.f32 f24, [r13];
ld.shared.f32 f25, [r13+2048];
add.f32 f26, f22, f23;
add.f32 f27, f24, f25;
sub.f32 f28, f22, f23;
sub.f32 f29, f24, f25;
bfe.u32 r14, r5, 1, 8;
mul.wide.u32 rd6, r14, 8;
mov.u64 rd7, %6;
add.s64 rd8, rd7, rd6;
ld.global.v2.f32 {f30, f31}, [rd8];
mul.f32 f34, f29, f31;
fma.rn.f32 f35, f30, f28, f34;
mul.f32 f36, f28, f31;
mul.f32 f37, f30, f29;
sub.f32 f38, f37, f36;
and.b32 r15, r11, 4;
add.s32 r16, r8, r15;
barrier.sync 0;
and.b32 r17, r6, 4080;
add.s32 r18, r16, r17;
st.shared.f32 [r18], f26;
st.shared.f32 [r18+8], f35;
barrier.sync 0;
and.b32 r19, r11, 2040;
sub.s32 r20, r18, r19;
ld.shared.f32 f39, [r20];
ld.shared.f32 f40, [r20+2048];
barrier.sync 0;
st.shared.f32 [r18], f27;
st.shared.f32 [r18+8], f38;
barrier.sync 0;
ld.shared.f32 f41, [r20];
ld.shared.f32 f42, [r20+2048];
add.f32 f43, f39, f40;
add.f32 f44, f41, f42;
sub.f32 f45, f39, f40;
sub.f32 f46, f41, f42;
bfe.u32 r21, r5, 2, 7;
mul.wide.u32 rd9, r21, 8;
mov.u64 rd10, %7;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f47, f48}, [rd11];
mul.f32 f51, f46, f48;
fma.rn.f32 f52, f47, f45, f51;
mul.f32 f53, f45, f48;
mul.f32 f54, f47, f46;
sub.f32 f55, f54, f53;
and.b32 r22, r11, 12;
add.s32 r23, r8, r22;
barrier.sync 0;
and.b32 r24, r6, 4064;
add.s32 r25, r23, r24;
st.shared.f32 [r25], f43;
st.shared.f32 [r25+16], f52;
barrier.sync 0;
and.b32 r26, r11, 2032;
sub.s32 r27, r25, r26;
ld.shared.f32 f56, [r27];
ld.shared.f32 f57, [r27+2048];
barrier.sync 0;
st.shared.f32 [r25], f44;
st.shared.f32 [r25+16], f55;
barrier.sync 0;
ld.shared.f32 f58, [r27];
ld.shared.f32 f59, [r27+2048];
add.f32 f60, f56, f57;
add.f32 f61, f58, f59;
sub.f32 f62, f56, f57;
sub.f32 f63, f58, f59;
and.b32 r28, r5, 504;
cvt.u64.u32 rd12, r28;
mov.u64 rd13, %8;
add.s64 rd14, rd13, rd12;
ld.global.v2.f32 {f64, f65}, [rd14];
mul.f32 f68, f63, f65;
fma.rn.f32 f69, f64, f62, f68;
mul.f32 f70, f62, f65;
mul.f32 f71, f64, f63;
sub.f32 f72, f71, f70;
and.b32 r29, r11, 28;
add.s32 r30, r8, r29;
barrier.sync 0;
and.b32 r31, r6, 4032;
add.s32 r32, r30, r31;
st.shared.f32 [r32], f60;
st.shared.f32 [r32+32], f69;
barrier.sync 0;
and.b32 r33, r11, 2016;
sub.s32 r34, r32, r33;
ld.shared.f32 f73, [r34];
ld.shared.f32 f74, [r34+2048];
barrier.sync 0;
st.shared.f32 [r32], f61;
st.shared.f32 [r32+32], f72;
barrier.sync 0;
ld.shared.f32 f75, [r34];
ld.shared.f32 f76, [r34+2048];
add.f32 f77, f73, f74;
add.f32 f78, f75, f76;
sub.f32 f79, f73, f74;
sub.f32 f80, f75, f76;
bfe.u32 r35, r5, 4, 5;
mul.wide.u32 rd15, r35, 8;
mov.u64 rd16, %9;
add.s64 rd17, rd16, rd15;
ld.global.v2.f32 {f81, f82}, [rd17];
mul.f32 f85, f80, f82;
fma.rn.f32 f86, f81, f79, f85;
mul.f32 f87, f79, f82;
mul.f32 f88, f81, f80;
sub.f32 f89, f88, f87;
and.b32 r36, r11, 60;
add.s32 r37, r8, r36;
barrier.sync 0;
and.b32 r38, r6, 3968;
add.s32 r39, r37, r38;
st.shared.f32 [r39], f77;
st.shared.f32 [r39+64], f86;
barrier.sync 0;
and.b32 r40, r11, 1984;
sub.s32 r41, r39, r40;
ld.shared.f32 f90, [r41];
ld.shared.f32 f91, [r41+2048];
barrier.sync 0;
st.shared.f32 [r39], f78;
st.shared.f32 [r39+64], f89;
barrier.sync 0;
ld.shared.f32 f92, [r41];
ld.shared.f32 f93, [r41+2048];
add.f32 f94, f90, f91;
add.f32 f95, f92, f93;
sub.f32 f96, f90, f91;
sub.f32 f97, f92, f93;
bfe.u32 r42, r5, 5, 4;
mul.wide.u32 rd18, r42, 8;
mov.u64 rd19, %10;
add.s64 rd20, rd19, rd18;
ld.global.v2.f32 {f98, f99}, [rd20];
mul.f32 f102, f97, f99;
fma.rn.f32 f103, f98, f96, f102;
mul.f32 f104, f96, f99;
mul.f32 f105, f98, f97;
sub.f32 f106, f105, f104;
and.b32 r43, r11, 124;
add.s32 r44, r8, r43;
barrier.sync 0;
and.b32 r45, r6, 3840;
add.s32 r46, r44, r45;
st.shared.f32 [r46], f94;
st.shared.f32 [r46+128], f103;
barrier.sync 0;
and.b32 r47, r11, 1920;
sub.s32 r48, r46, r47;
ld.shared.f32 f107, [r48];
ld.shared.f32 f108, [r48+2048];
barrier.sync 0;
st.shared.f32 [r46], f95;
st.shared.f32 [r46+128], f106;
barrier.sync 0;
ld.shared.f32 f109, [r48];
ld.shared.f32 f110, [r48+2048];
add.f32 f111, f107, f108;
add.f32 f112, f109, f110;
sub.f32 f113, f107, f108;
sub.f32 f114, f109, f110;
bfe.u32 r49, r5, 6, 3;
mul.wide.u32 rd21, r49, 8;
mov.u64 rd22, %11;
add.s64 rd23, rd22, rd21;
ld.global.v2.f32 {f115, f116}, [rd23];
mul.f32 f119, f114, f116;
fma.rn.f32 f120, f115, f113, f119;
mul.f32 f121, f113, f116;
mul.f32 f122, f115, f114;
sub.f32 f123, f122, f121;
and.b32 r50, r11, 252;
add.s32 r51, r8, r50;
barrier.sync 0;
and.b32 r52, r6, 3584;
add.s32 r53, r51, r52;
st.shared.f32 [r53], f111;
st.shared.f32 [r53+256], f120;
barrier.sync 0;
and.b32 r54, r11, 1792;
sub.s32 r55, r53, r54;
ld.shared.f32 f124, [r55];
ld.shared.f32 f125, [r55+2048];
barrier.sync 0;
st.shared.f32 [r53], f112;
st.shared.f32 [r53+256], f123;
barrier.sync 0;
ld.shared.f32 f126, [r55];
ld.shared.f32 f127, [r55+2048];
add.f32 f128, f124, f125;
add.f32 f129, f126, f127;
sub.f32 f130, f124, f125;
sub.f32 f131, f126, f127;
bfe.u32 r56, r5, 7, 2;
mul.wide.u32 rd24, r56, 8;
mov.u64 rd25, %12;
add.s64 rd26, rd25, rd24;
ld.global.v2.f32 {f132, f133}, [rd26];
mul.f32 f136, f131, f133;
fma.rn.f32 f137, f132, f130, f136;
mul.f32 f138, f130, f133;
mul.f32 f139, f132, f131;
sub.f32 f140, f139, f138;
and.b32 r57, r11, 508;
add.s32 r58, r8, r57;
barrier.sync 0;
and.b32 r59, r6, 3072;
add.s32 r60, r58, r59;
st.shared.f32 [r60], f128;
st.shared.f32 [r60+512], f137;
barrier.sync 0;
and.b32 r61, r11, 1536;
sub.s32 r62, r60, r61;
ld.shared.f32 f141, [r62];
ld.shared.f32 f142, [r62+2048];
barrier.sync 0;
st.shared.f32 [r60], f129;
st.shared.f32 [r60+512], f140;
barrier.sync 0;
ld.shared.f32 f143, [r62];
ld.shared.f32 f144, [r62+2048];
add.f32 f145, f141, f142;
add.f32 f146, f143, f144;
sub.f32 f147, f141, f142;
sub.f32 f148, f143, f144;
bfe.u32 r63, r5, 8, 1;
mul.wide.u32 rd27, r63, 8;
mov.u64 rd28, %13;
add.s64 rd29, rd28, rd27;
ld.global.v2.f32 {f149, f150}, [rd29];
mul.f32 f153, f148, f150;
fma.rn.f32 f154, f149, f147, f153;
mul.f32 f155, f147, f150;
mul.f32 f156, f149, f148;
sub.f32 f157, f156, f155;
and.b32 r64, r11, 1020;
add.s32 r65, r8, r64;
barrier.sync 0;
and.b32 r66, r6, 2048;
add.s32 r67, r65, r66;
st.shared.f32 [r67], f145;
st.shared.f32 [r67+1024], f154;
barrier.sync 0;
and.b32 r68, r11, 1024;
sub.s32 r69, r67, r68;
ld.shared.f32 f158, [r69];
ld.shared.f32 f159, [r69+2048];
barrier.sync 0;
st.shared.f32 [r67], f146;
st.shared.f32 [r67+1024], f157;
barrier.sync 0;
ld.shared.f32 f160, [r69];
ld.shared.f32 f161, [r69+2048];
add.f32 %0, f158, f159;
add.f32 %1, f160, f161;
sub.f32 %2, f158, f159;
sub.f32 %3, f160, f161;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y));
};


#endif
